diff --git a/examples/iouring/source/app.d b/examples/iouring/source/app.d new file mode 100644 index 0000000..005fa50 --- /dev/null +++ b/examples/iouring/source/app.d @@ -0,0 +1,44 @@ +import std.stdio : writeln; +import concurrency; +import concurrency.socket; +import concurrency.io; +import concurrency.operations.whenall; + +void main() @safe +{ + auto fd = listen("127.0.0.1", 0); + auto io = IOUringContext.construct(256); + auto socket = getSocket(); + auto port = fd.getPort(); + + writeln("Listening on 127.0.0.1:", port); + + writeln("Connecting..."); + auto client = io.run( + whenAll( + io.accept(fd), + io.connect(socket, "127.0.0.1", port), + ) + ).syncWait().value[0]; + + + ubyte[4] bufferRead; + ubyte[4] bufferSend; + bufferSend[0..4] = [1,2,3,4]; + + writeln("Transmitting..."); + auto result = io.run( + whenAll( + io.write(socket, bufferSend[]), + io.read(client.fd, bufferRead[]), + ) + ).syncWait().value[1]; + + + writeln("Closing..."); + closeSocket(client.fd); + closeSocket(socket); + closeSocket(fd); + + writeln("Got: ", result); +} diff --git a/source/concurrency/io/iouring.d b/source/concurrency/io/iouring.d new file mode 100644 index 0000000..ca40bc1 --- /dev/null +++ b/source/concurrency/io/iouring.d @@ -0,0 +1,715 @@ +module concurrency.io.iouring; + +version(linux): + +import concurrency.data.queue.mpsc; +import concurrency.stoptoken; +import concurrency.receiver : setErrno, setValueOrError; +import during; +import core.stdc.errno : ECANCELED; + +private struct Queue(Node) { + import concurrency.data.queue.mpsc : Chain; + Node* head, tail; + + bool empty() { + return tail is null; + } + + Node* pop() { + auto res = tail; + tail = tail.next; + return res; + } + + void append(Chain!(Node) chain) { + if (empty) { + head = chain.head; + tail = chain.tail; + } else { + head.next = chain.tail; + head = chain.head; + } + } +} + +struct IOUringContext { + import concurrency.scheduler : Timer, TimingWheels, TimerCommand; + import core.time : msecs, Duration; + import core.thread : ThreadID; + import std.process : thisThreadID; + + private MPSCQueue!(Item) requests; + private MPSCQueue!(Timer) timers; + private Queue!(Item) pending; + private ptrdiff_t totalSubmitted; + private ptrdiff_t newlySubmitted; + private Uring io; + private ubyte[8] buffer; + private int event; + private TimingWheels wheels; + private enum Duration tickSize = 1.msecs; + private ThreadID threadId; + // TODO: instead of using timers and timeout on the iouring_enter + // we could also use IORING_OP_TIMEOUT + // or even IORING_OP_LINK_TIMEOUT to link the timeout to the + // wakeup event + private bool dirtyTimers; + private long nextTimer; + private shared bool needsWakeup; + private this(uint size) @trusted { + // TODO: look into `IORING_SETUP_SQPOLL` for fast submission + import core.sys.linux.sys.eventfd; + io.setup(size, + SetupFlags.SINGLE_ISSUER + // | SetupFlags.DEFER_TASKRUN + // | SetupFlags.COOP_TASKRUN + // | SetupFlags.SQPOLL + ); + event = eventfd(0, EFD_CLOEXEC); + requests = new MPSCQueue!(Item); + timers = new MPSCQueue!(Timer); + wheels.initialize(); + threadId = thisThreadID; + } + + static auto construct(uint size) @safe { + return IOUringContext(size); + } + + private ref assumeThreadSafe() @trusted nothrow shared { + return cast()this; + } + // TODO: can this be make @safe? + // otherwise it might have to be made @system perhaps? + // we are taking a pointer to a ref item, which likely + // sits on the stack. 
+ // logically however, all the lifetimes are correct though + private bool push(ref Item item) @trusted nothrow shared { + int __n = 0; + int __no_new_submissions = 1; + + if (__n == __no_new_submissions) { + CompletionEntry entry; + entry.res = -ECANCELED; + item.complete(entry); + return false; + } else { + if (threadId == thisThreadID) { + with (assumeThreadSafe) { + if (!io.full) { + submitItem(&item); + io.flush(); + return true; + } + } + } + + requests.push(&item); + return true; + } + } + + private void wakeup() @trusted nothrow shared { + import core.sys.posix.unistd; + import core.atomic : atomicLoad, MemoryOrder, cas; + size_t wakeup = 1; + if ((&needsWakeup).cas!(MemoryOrder.raw, MemoryOrder.raw)(true, false)) { + core.sys.posix.unistd.write(event, &wakeup, wakeup.sizeof); + } + } + + import core.time : Duration; + void addTimer(ref Timer timer, Duration dur) @trusted shared { + timer.scheduled_at = dur.split!"hnsecs".hnsecs; + timer.command = TimerCommand.Register; + timers.push(&timer); + wakeup(); + } + + void cancelTimer(ref Timer timer) @trusted shared { + timer.command = TimerCommand.Cancel; + timers.push(&timer); + wakeup(); + } + + auto run(Sender)(Sender sender) @safe nothrow { + return RunSender!(Sender)(&this, sender); + } + + private int run(scope shared StopToken stopToken) @safe nothrow { + import core.atomic : atomicStore, MemoryOrder; + + assert(threadId == thisThreadID, "Thread that started IOUringContext must also drive it."); + pending.append(requests.popAll()); + scheduleTimers(); + + putEventFdChannel(); + while (!stopToken.isStopRequested() || !pending.empty() || !io.empty()) { + putPending(); + + int rc = submitAndWait(); + // TODO: return without completing all pending or completed requests + // will result in blocked request. Instead we need to cancel all requests + // until the stopToken is triggered. + // Would it be possible to cancel the whole context in one go? 
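The run()/wakeup() pair here relies on an eventfd plus the atomic `needsWakeup` flag so that producer threads only pay for a syscall when the loop is actually about to block. Below is a minimal, self-contained sketch of that handshake, illustration only and not part of this patch; the names `makeWakeupFd`, `wakeLoop`, `loopIteration` and `asleep` are made up, the real code uses `event`, `wakeup()`, `run()` and `needsWakeup`.

```d
import core.atomic : atomicStore, cas, MemoryOrder;
import core.sys.linux.sys.eventfd : eventfd, EFD_CLOEXEC;
import core.sys.posix.unistd : write;

// Create the eventfd the loop keeps a pending read on (see putEventFdChannel below).
int makeWakeupFd() @trusted {
    return eventfd(0, EFD_CLOEXEC);
}

// Producer side (any thread): only hit the eventfd when the loop declared itself
// about to block; redundant wakeups then cost no syscall.
void wakeLoop(shared(bool)* asleep, int evfd) @trusted nothrow {
    ulong one = 1;
    if (cas!(MemoryOrder.raw, MemoryOrder.raw)(asleep, true, false))
        write(evfd, &one, one.sizeof);
}

// Loop side, one iteration of the run() loop in outline.
void loopIteration(shared(bool)* asleep) @trusted nothrow {
    atomicStore!(MemoryOrder.raw)(*asleep, false); // awake: producers just enqueue
    // ... drain the MPSC queues, submit SQEs, reap CQEs, fire timers ...
    atomicStore!(MemoryOrder.raw)(*asleep, true);  // from here on producers must write the eventfd
    // ... block in submitAndWait(); the pending read on the eventfd completes on wakeup ...
}
```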
+ if (rc < 0) + return -rc; + atomicStore!(MemoryOrder.raw)(needsWakeup, false); + + completeTimers(); + popCompleted(); + scheduleTimers(); + + atomicStore!(MemoryOrder.raw)(needsWakeup, true); + pending.append(requests.popAll()); + } + + return 0; + } + + private void scheduleTimers() @safe nothrow { + import std.datetime.systime : Clock; + import core.time : hnsecs; + import concurrency.scheduler : TimerTrigger; + + Queue!(Timer) items; + items.append(timers.popAll()); + + if (!items.empty) + dirtyTimers = true; + + while (!items.empty) { + auto timer = items.pop(); + + if (timer.command == TimerCommand.Register) { + auto real_now = Clock.currStdTime; + auto tw_now = wheels.currStdTime(tickSize); + auto delay = (real_now - tw_now).hnsecs; + auto at = (timer.scheduled_at.hnsecs + delay) / tickSize; + wheels.schedule(timer, at); + } else { + wheels.cancel(timer); + timer.userdata(TimerTrigger.cancel); + } + } + } + + private void completeTimers() @safe nothrow { + import std.datetime.systime : Clock; + import concurrency.scheduler : TimerTrigger; + + int incr = wheels.ticksToCatchUp(tickSize, Clock.currStdTime); + if (incr > 0) { + Timer* t; + wheels.advance(incr, t); + if (t !is null) + dirtyTimers = true; + while (t !is null) { + auto next = t.next; + t.userdata(TimerTrigger.trigger); + t = next; + } + } + } + + import std.typecons : Nullable; + private Nullable!Duration timeUntilNextTimer() @safe nothrow { + import std.datetime.systime : Clock; + import core.time : hnsecs; + + long now = Clock.currStdTime; + if (dirtyTimers) { + dirtyTimers = false; + auto nextTriggerOpt = wheels.timeUntilNextEvent(tickSize, now); + if (nextTriggerOpt.isNull) { + nextTimer = 0; + return typeof(return).init; + } + nextTimer = now + nextTriggerOpt.get.split!"hnsecs".hnsecs; + return nextTriggerOpt; + } else if (nextTimer != 0) { + return typeof(return)((nextTimer - now).hnsecs); + } else { + return typeof(return).init; + } + } + + private int submitAndWait() @safe nothrow { + import std.datetime.systime : Clock; + + auto nextTriggerOpt = timeUntilNextTimer(); + + if (!nextTriggerOpt.isNull) { + // next timer is in 0 msecs + if (nextTriggerOpt.get <= 0.msecs) { + // only submit any SubmissionEntries + return io.submit(); + } + + // set io_uring timeout + io_uring_getevents_arg arg; + KernelTimespec timespec; + + auto parts = nextTriggerOpt.get().split!("seconds", "nsecs"); + timespec.tv_sec = parts.seconds; + timespec.tv_nsec = parts.nsecs; + arg.ts = cast(ulong)(cast(void*)&timespec); + + return io.submitAndWait(1, &arg); + } + + return io.submitAndWait(1); + } + + private void putEventFdChannel() @safe nothrow { + io.putWith!((ref SubmissionEntry e, IOUringContext* context) { + e.prepRead(context.event, context.buffer[0..8], 0); + })(&this); + } + + private void putPending() @safe nothrow { + while (!pending.empty && !io.full()) { + auto item = pending.pop(); + submitItem(item); + } + } + + private void submitItem(Item* item) @safe nothrow { + SubmissionEntry entry; + if (item.submit(entry)) { + entry.setUserDataRaw(item); + io.put(entry); + } + } + + private void popCompleted() @safe nothrow { + // TODO: to reduce latency, would it help to run submit and complete in a loop?
+ while (!io.empty()) { + auto entry = io.front(); + auto item = entry.userDataAs!(Item*); + if (item !is null) + item.complete(entry); + else + putEventFdChannel(); + io.popFront(); + } + } +} + +struct RunSender(Sender) { + alias Value = Sender.Value; + IOUringContext* context; + Sender sender; + auto connect(Receiver)(return Receiver receiver) @safe return scope { + // ensure NRVO + auto op = RunOp!(Sender, Receiver)(context, sender, receiver); + return op; + } +} + +struct RunOp(Sender, Receiver) { + import concurrency : Cancelled, justFrom, JustFromSender; + import concurrency.sender : OpType; + import concurrency.operations.dofinally; + import concurrency.operations.whenall; + import concurrency.operations.withscheduler; + import concurrency.operations.withioscheduler; + + alias RunSender = JustFromSender!(void delegate() @trusted shared); + alias SenderWithScheduler = WithSchedulerSender!(Sender, IOUringScheduler); + alias SenderWithIOScheduler = WithIOSchedulerSender!(SenderWithScheduler, IOUringScheduler); + alias ValueSender = DoFinallySender!(SenderWithIOScheduler, void delegate() @safe nothrow shared); + alias CombinedSender = WhenAllSender!(ValueSender, RunSender); + alias Op = OpType!(CombinedSender, Receiver); + + IOUringContext* context; + shared StopSource stopSource; + Op op; + + @disable + this(ref return scope typeof(this) rhs); + @disable + this(this); + + this(IOUringContext* context, Sender sender, return Receiver receiver) @trusted return scope { + this.context = context; + shared IOUringContext* sharedContext = cast(shared)context; + auto scheduler = IOUringScheduler(sharedContext); + op = whenAll( + sender.withScheduler(scheduler).withIOScheduler(scheduler).doFinally(() @safe nothrow shared { + stopSource.stop(); + sharedContext.wakeup(); + }), + justFrom(&(cast(shared)this).run), + ).connect(receiver); + } + private void run() @trusted shared { + with(cast()this) { + import std.exception : ErrnoException; + auto token = stopSource.token(); + auto res = context.run(token); + if (res < 0) + throw new ErrnoException("IOUring failed", -res); + } + } + void start() @safe nothrow { + op.start(); + } +} + +struct Item { + // TODO: we are storing 2 this pointers here + bool delegate(ref SubmissionEntry sqe) @safe nothrow submit; + void delegate(ref const CompletionEntry cqe) @safe nothrow complete; + Item* next; +} + +struct CancellableOperation(Operation) { + private shared(IOUringContext)* context; + private Operation operation; + private shared size_t ops; + private shared StopCallback cb; + private Item item; + + @disable + this(ref return scope typeof(this) rhs); + @disable + this(this); + + @disable void opAssign(typeof(this) rhs) nothrow @safe @nogc; + @disable void opAssign(ref typeof(this) rhs) nothrow @safe @nogc; + + this(shared IOUringContext* context, Operation operation) @safe { + this.context = context; + this.operation = operation; + } + + void start() @trusted nothrow scope { + item.submit = &submit; + item.complete = &complete; + if (context.push(item)) { + context.wakeup(); + } + } + + // TODO: shouldn't submit be shared? 
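CancellableOperation keeps a small atomic counter (`ops`, see submit/complete/onStop below): once a cancellation is requested two CQEs are expected, the operation's own and the cancel's, and only the last one to arrive is allowed to complete the receiver. A standalone sketch of that counting scheme, with hypothetical names (`CompletionGate`, `outstanding`), purely for illustration:

```d
import core.atomic : atomicFetchAdd, atomicFetchSub, cas, MemoryOrder;

struct CompletionGate {
    private shared size_t outstanding;

    // Called once when the operation's SQE is submitted.
    void armed() nothrow @safe {
        atomicFetchAdd!(MemoryOrder.raw)(outstanding, 1);
    }

    // Called when a stop is requested; returns true if a cancel SQE should be pushed,
    // because the cancel will produce a CQE of its own (1 -> 2).
    bool requestCancel() nothrow @safe {
        size_t expected = 1;
        return cas!(MemoryOrder.raw, MemoryOrder.raw)(&outstanding, expected, 2);
    }

    // Called for every CQE belonging to this operation (the op itself and, possibly,
    // its cancellation); returns true only for the final one, which may deliver.
    bool lastCompletion() nothrow @safe {
        return atomicFetchSub!(MemoryOrder.raw)(outstanding, 1) == 1;
    }
}
```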
+ private bool submit(ref SubmissionEntry entry) @trusted nothrow { + try { + import core.atomic; + ops.atomicFetchAdd!(MemoryOrder.raw)(1); + auto stopToken = operation.receiver.getStopToken(); + cb.register(stopToken, &(cast(shared)this).onStop); + + operation.submit(entry); + return true; + } catch (Throwable e) { + operation.receiver.setError(e); + return false; + } + } + + private void complete(const ref CompletionEntry entry) @safe nothrow { + import core.atomic; + if (ops.atomicFetchSub!(MemoryOrder.raw)(1) != 1) + return; + + cb.dispose(); + + auto token = operation.receiver.getStopToken(); + if (entry.res == -ECANCELED || token.isStopRequested()) { + operation.receiver.setDone(); + } else { + operation.complete(entry); + } + } + + private void onStop() nothrow @safe shared { + import core.atomic; + size_t expected = 1; + if (cas!(MemoryOrder.raw, MemoryOrder.raw)(&ops, expected, 2)) { + // Note we reuse the original item since submit already happened + // and the userData needs to be the same for the cancellation + // anyway. + with(assumeThreadSafe) { + item.submit = &submitStop; + if (this.context.push(item)) { + this.context.wakeup(); + } + } + } + } + + private bool submitStop(ref SubmissionEntry entry) nothrow @safe { + entry.prepCancel(item); + return true; + } + + private ref assumeThreadSafe() nothrow @trusted shared { + return cast()this; + } +} + +struct IOUringScheduler { + import core.time : Duration; + import std.socket : socket_t; + shared (IOUringContext)* context; + + auto read(socket_t fd, ubyte[] buffer, long offset = 0) @safe nothrow @nogc { + return ReadSender(context, fd, buffer, offset); + } + + auto accept(socket_t fd) @safe nothrow @nogc { + return AcceptSender(context, fd); + } + + auto connect(socket_t fd, string address, ushort port) @safe nothrow @nogc { + return ConnectSender(context, fd, address, port); + } + + auto write(socket_t fd, const(ubyte)[] buffer, long offset = 0) @safe nothrow @nogc { + return WriteSender(context, fd, buffer, offset); + } + + auto close(socket_t fd) @safe nothrow @nogc { + return CloseSender(context, fd); + } + + auto schedule() @safe nothrow @nogc { + import concurrency.scheduler : ScheduleAfterSender; + import core.time : msecs; + return ScheduleAfterSender!(shared IOUringContext*)(context, 0.msecs); + } + + auto scheduleAfter(Duration duration) @safe nothrow @nogc { + import concurrency.scheduler : ScheduleAfterSender; + return ScheduleAfterSender!(shared IOUringContext*)(context, duration); + } +} + +struct ReadSender { + import std.socket : socket_t; + alias Value = ubyte[]; + shared IOUringContext* context; + socket_t fd; + ubyte[] buffer; + long offset; + auto connect(Receiver)(return Receiver receiver) @safe return scope { + // ensure NRVO + auto op = CancellableOperation!(ReadOperation!Receiver)( + context, + ReadOperation!(Receiver)(fd, buffer, offset, receiver) + ); + return op; + } +} + +struct ReadOperation(Receiver) { + import std.socket : socket_t; + socket_t fd; + ubyte[] buffer; + long offset; + Receiver receiver; + void submit(ref SubmissionEntry entry) @safe nothrow { + entry.prepRead(fd, buffer, offset); + } + void complete(const ref CompletionEntry entry) @safe nothrow { + if (entry.res > 0) { + receiver.setValueOrError(buffer[offset..entry.res]); + } else if (entry.res == 0) { + receiver.setDone(); + } else { + receiver.setErrno("Read failed", -entry.res); + } + } +} + +struct AcceptSender { + import concurrency.ioscheduler : Client; + import std.socket : socket_t; + alias Value = Client; + shared 
IOUringContext* context; + socket_t fd; + auto connect(Receiver)(return Receiver receiver) @safe return scope { + // ensure NRVO + auto op = CancellableOperation!(AcceptOperation!Receiver)( + context, + AcceptOperation!(Receiver)(fd, receiver) + ); + return op; + } +} + +struct AcceptOperation(Receiver) { + import core.sys.posix.sys.socket : sockaddr, socklen_t; + import core.sys.posix.netinet.in_; + import concurrency.ioscheduler : Client; + import std.socket : socket_t; + + socket_t fd; + Receiver receiver; + sockaddr addr; + socklen_t addrlen; + void submit(ref SubmissionEntry entry) @safe nothrow { + entry.prepAccept(fd, addr, addrlen); + } + void complete(const ref CompletionEntry entry) @safe nothrow { + import std.socket : socket_t; + if (entry.res >= 0) { + receiver.setValueOrError(Client(cast(socket_t)entry.res, addr, addrlen)); + } else { + receiver.setErrno("Accept failed", -entry.res); + } + } +} + +struct ConnectSender { + import std.socket : socket_t; + alias Value = socket_t; + shared IOUringContext* context; + socket_t fd; + string address; + ushort port; + auto connect(Receiver)(return Receiver receiver) @safe return scope { + // ensure NRVO + auto op = CancellableOperation!(ConnectOperation!Receiver)( + context, + ConnectOperation!(Receiver)(fd, address, port, receiver) + ); + return op; + } +} + +struct ConnectOperation(Receiver) { + import core.sys.posix.sys.socket; + import std.socket : socket_t; + version(Windows) { + import core.sys.windows.windows; + } else version(Posix) { + import core.sys.posix.netinet.in_; + } + socket_t fd; + Receiver receiver; + sockaddr_in addr; + this(socket_t fd, string address, ushort port, Receiver receiver) @trusted { + this.fd = fd; + this.receiver = receiver; + addr.sin_family = AF_INET; + addr.sin_port = htons(port); + + import std.string : toStringz; + uint uiaddr = ntohl(inet_addr(address.toStringz())); + if (INADDR_NONE == uiaddr) { + throw new Exception( + "bad listening host given, please use an IP address." 
+ ); + } + + addr.sin_addr.s_addr = htonl(uiaddr); + } + void submit(ref SubmissionEntry entry) @safe nothrow { + entry.prepConnect(fd, addr); + } + void complete(const ref CompletionEntry entry) @safe nothrow { + if (entry.res >= 0) { + receiver.setValueOrError(cast(socket_t)entry.res); + } else { + receiver.setErrno("Connect failed", -entry.res); + } + } +} + +struct WriteSender { + import std.socket : socket_t; + alias Value = int; + shared IOUringContext* context; + socket_t fd; + const(ubyte)[] buffer; + long offset; + auto connect(Receiver)(return Receiver receiver) @safe return scope { + // ensure NRVO + auto op = CancellableOperation!(WriteOperation!Receiver)( + context, + WriteOperation!(Receiver)(fd, buffer, offset, receiver) + ); + return op; + } +} + +struct WriteOperation(Receiver) { + import std.socket : socket_t; + socket_t fd; + const(ubyte)[] buffer; + long offset; + Receiver receiver; + void submit(ref SubmissionEntry entry) @safe nothrow { + entry.prepWrite(fd, buffer, offset); + } + void complete(const ref CompletionEntry entry) @safe nothrow { + if (entry.res > 0) { + receiver.setValueOrError(entry.res); + } else if (entry.res == 0) { + receiver.setDone(); + } else { + receiver.setErrno("Write failed", -entry.res); + } + } +} + +struct CloseSender { + import std.socket : socket_t; + alias Value = void; + shared IOUringContext* context; + socket_t fd; + auto connect(Receiver)(return Receiver receiver) @safe return scope { + // ensure NRVO + auto op = CancellableOperation!(CloseOperation!Receiver)( + context, + CloseOperation!(Receiver)(fd, receiver) + ); + return op; + } +} + +struct CloseOperation(Receiver) { + import std.socket : socket_t; + socket_t fd; + Receiver receiver; + void submit(ref SubmissionEntry entry) @safe nothrow { + entry.prepClose(fd); + } + void complete(const ref CompletionEntry entry) @safe nothrow { + if (entry.res >= 0) { + receiver.setValueOrError(); + } else { + receiver.setErrno("Close failed", -entry.res); + } + } +} + +struct NopSender { + alias Value = int; + shared IOUringContext* context; + auto connect(Receiver)(return Receiver receiver) @safe return scope { + // ensure NRVO + auto op = CancellableOperation!(NopOperation!Receiver)( + context, + NopOperation!(Receiver)(receiver) + ); + return op; + } +} + +struct NopOperation(Receiver) { + Receiver receiver; + void submit(ref SubmissionEntry entry) @safe nothrow { + entry.prepNop(); + } + void complete(const ref CompletionEntry entry) @safe nothrow { + if (entry.res >= 0) { + receiver.setValueOrError(entry.res); + } else { + receiver.setErrno("Nop failed", -entry.res); + } + } +} diff --git a/source/concurrency/io/package.d b/source/concurrency/io/package.d new file mode 100644 index 0000000..25ff857 --- /dev/null +++ b/source/concurrency/io/package.d @@ -0,0 +1,86 @@ +module concurrency.io; + +import concurrency.io.iouring; +import concurrency.ioscheduler : Client; + +import std.socket : socket_t; + +version (linux) + alias IOContext = IOUringContext; + +auto readAsync(socket_t fd, ubyte[] buffer, long offset = 0) @safe nothrow @nogc { + return ReadAsyncSender(fd, buffer, offset); +} + +struct ReadAsyncSender { + alias Value = ubyte[]; + socket_t fd; + ubyte[] buffer; + long offset; + auto connect(Receiver)(return Receiver receiver) @safe return scope { + // ensure NRVO + auto op = receiver.getIOScheduler().read(fd, buffer, offset).connect(receiver); + return op; + } +} + +auto acceptAsync(socket_t fd) @safe nothrow @nogc { + return AcceptAsyncSender(fd); +} + +struct AcceptAsyncSender 
{ + alias Value = Client; + socket_t fd; + auto connect(Receiver)(return Receiver receiver) @safe return scope { + // ensure NRVO + auto op = receiver.getIOScheduler().accept(fd).connect(receiver); + return op; + } +} + +auto connectAsync(socket_t fd, string address, ushort port) @safe nothrow @nogc { + return ConnectAsyncSender(fd, address, port); +} + +struct ConnectAsyncSender { + import std.socket : socket_t; + alias Value = socket_t; + socket_t fd; + string address; + ushort port; + auto connect(Receiver)(return Receiver receiver) @safe return scope { + // ensure NRVO + auto op = receiver.getIOScheduler().connect(fd, address, port).connect(receiver); + return op; + } +} + +auto writeAsync(socket_t fd, const(ubyte)[] buffer, long offset = 0) @safe nothrow @nogc { + return WriteAsyncSender(fd, buffer, offset); +} + +struct WriteAsyncSender { + alias Value = int; + socket_t fd; + const(ubyte)[] buffer; + long offset; + auto connect(Receiver)(return Receiver receiver) @safe return scope { + // ensure NRVO + auto op = receiver.getIOScheduler().write(fd, buffer, offset).connect(receiver); + return op; + } +} + +auto closeAsync(socket_t fd) @safe nothrow @nogc { + return CloseAsyncSender(fd); +} + +struct CloseAsyncSender { + alias Value = void; + socket_t fd; + auto connect(Receiver)(return Receiver receiver) @safe return scope { + // ensure NRVO + auto op = receiver.getIOScheduler().close(fd).connect(receiver); + return op; + } +} diff --git a/source/concurrency/io/socket.d b/source/concurrency/io/socket.d new file mode 100644 index 0000000..c9efff9 --- /dev/null +++ b/source/concurrency/io/socket.d @@ -0,0 +1,124 @@ +module concurrency.io.socket; +import std.socket : socket_t; + +auto tcpSocket() @trusted { + import std.socket : socket_t; + version(Windows) { + import core.sys.windows.windows; + } else version(Posix) { + import core.sys.posix.unistd; + import core.sys.posix.sys.socket; + import core.sys.posix.netinet.in_; + import core.sys.posix.sys.wait; + import core.sys.posix.sys.select; + import core.sys.posix.netinet.tcp; + } + + version(linux) { + import core.sys.linux.sys.eventfd; + enum SOCK_NONBLOCK = 0x800; + socket_t sock = cast(socket_t) socket(AF_INET, SOCK_STREAM | SOCK_NONBLOCK, 0); + } else version(Windows) { + socket_t sock = cast(socket_t) socket(AF_INET, SOCK_STREAM, 0); + uint nonblocking_long = 1; + if (ioctlsocket(sock, FIONBIO, &nonblocking_long) == SOCKET_ERROR) + throw new Exception("ioctlsocket failed"); + } + + if (sock == -1) + throw new Exception("socket"); + + int on = 1; + setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, &on, on.sizeof); + setsockopt(sock, IPPROTO_TCP, TCP_NODELAY, &on, on.sizeof); + version(Posix) // on windows REUSEADDR includes REUSEPORT + setsockopt(sock, SOL_SOCKET, SO_REUSEPORT, &on, on.sizeof); + + return sock; +} + +auto listenTcp(string address = "", ushort port = 0, int backlog = 128) @trusted { + import core.stdc.stdio : fprintf, stderr; + import std.socket : socket_t; + version(Windows) { + import core.sys.windows.windows; + } else version(Posix) { + import core.sys.posix.unistd; + import core.sys.posix.sys.socket; + import core.sys.posix.netinet.in_; + import core.sys.posix.sys.wait; + import core.sys.posix.sys.select; + import core.sys.posix.netinet.tcp; + } + + version(linux) { + import core.sys.linux.sys.eventfd; + } + import core.stdc.errno; + + socket_t sock = tcpSocket(); + + sockaddr_in addr; + addr.sin_family = AF_INET; + addr.sin_port = htons(port); + + if (address.length) { + import std.string : toStringz; + uint uiaddr = 
ntohl(inet_addr(address.toStringz())); + if (INADDR_NONE == uiaddr) { + throw new Exception( + "bad listening host given, please use an IP address.\nExample: --listening-host 127.0.0.1 means listen only on Localhost.\nExample: --listening-host 0.0.0.0 means listen on all interfaces.\nOr you can pass any other single numeric IPv4 address." + ); + } + + addr.sin_addr.s_addr = htonl(uiaddr); + } else + addr.sin_addr.s_addr = INADDR_ANY; + + if (bind(sock, cast(sockaddr*) &addr, addr.sizeof) == -1) { + closeSocket(sock); + throw new Exception("bind"); + } + + if (listen(sock, backlog) == -1) { + closeSocket(sock); + throw new Exception("listen"); + } + + return sock; +} + +auto closeSocket(socket_t sock) @trusted { + import core.sys.posix.unistd; + version(Windows) { + import core.sys.windows.windows; + closesocket(sock); + } else + close(sock); +} + +ushort getPort(socket_t socket) @trusted { + import std.socket; + version(Windows) { + import core.sys.windows.windows; + } else version(Posix) { + import core.sys.posix.unistd; + import core.sys.posix.sys.socket; + import core.sys.posix.netinet.in_; + import core.sys.posix.sys.wait; + import core.sys.posix.sys.select; + import core.sys.posix.netinet.tcp; + } + + version(linux) { + import core.sys.linux.sys.eventfd; + enum SOCK_NONBLOCK = 0x800; + } + + sockaddr_in sin; + socklen_t nameLen = sin.sizeof; + if (Socket.ERROR == getsockname(socket, cast(sockaddr*)&sin, &nameLen)) + throw new SocketOSException("Unable to obtain local socket address"); + + return ntohs(sin.sin_port); +} \ No newline at end of file diff --git a/source/concurrency/ioscheduler.d b/source/concurrency/ioscheduler.d new file mode 100644 index 0000000..de3e07f --- /dev/null +++ b/source/concurrency/ioscheduler.d @@ -0,0 +1,94 @@ +module concurrency.ioscheduler; + +import concurrency.sender : SenderObjectBase, isSender; +import core.time : Duration; +import concepts; +import std.typecons : Nullable, nullable; + +void checkIOScheduler(T)() { + import concurrency.sender : checkSender; + import core.time : msecs; + import std.traits : ReturnType; + alias ReadSender = ReturnType!(T.read); + checkSender!ReadSender(); + // TODO: add other function checks +} + +enum isIOScheduler(T) = is(typeof(checkIOScheduler!T)); + +struct Client { + import std.socket : socket_t; + version(Windows) { + import core.sys.windows.windows : sockaddr, socklen_t; + } else version(Posix) { + import core.sys.posix.sys.socket : sockaddr, socklen_t; + } + + socket_t fd; + sockaddr addr; + socklen_t addrlen; +} + +/// polymorphic IOScheduler +interface IOSchedulerObjectBase { + import std.socket : socket_t; + // TODO: read/write/close aren't just for sockets really + SenderObjectBase!(ubyte[]) read(socket_t fd, return ubyte[] buffer, long offset = 0) @safe; + SenderObjectBase!(Client) accept(socket_t fd) @safe; + SenderObjectBase!(socket_t) connect(socket_t fd, return string address, ushort port) @safe; + SenderObjectBase!(int) write(socket_t fd, return const(ubyte)[] buffer, long offset = 0) @safe; + SenderObjectBase!(void) close(socket_t fd) @safe; +} + +struct NullIOScheduler { + import std.socket : socket_t; + import concurrency.sender : ValueSender; + + string errorMsg; + + ValueSender!(ubyte[]) read(socket_t fd, return ubyte[] buffer, long offset = 0) @safe { + throw new Exception(errorMsg); + } + ValueSender!(Client) accept(socket_t fd) @safe { + throw new Exception(errorMsg); + } + ValueSender!(socket_t) connect(socket_t fd, return string address, ushort port) @safe { + throw new Exception(errorMsg); 
+ } + ValueSender!(int) write(socket_t fd, return const(ubyte)[] buffer, long offset = 0) @safe { + throw new Exception(errorMsg); + } + ValueSender!(void) close(socket_t fd) @safe { + throw new Exception(errorMsg); + } +} + +class IOSchedulerObject(S) : IOSchedulerObjectBase { + import concurrency.sender : toSenderObject; + S scheduler; + this(S scheduler) { + this.scheduler = scheduler; + } + + SenderObjectBase!(ubyte[]) read(socket_t fd, return ubyte[] buffer, long offset = 0) @safe { + return scheduler.read(fd, buffer, offset).toSenderObject(); + } + SenderObjectBase!(Client) accept(socket_t fd) @safe { + return scheduler.accept(fd).toSenderObject(); + } + // TODO: is trusted because of scope string address + SenderObjectBase!(socket_t) connect(socket_t fd, return string address, ushort port) @trusted { + string adr = address; + return scheduler.connect(fd, adr, port).toSenderObject(); + } + SenderObjectBase!(int) write(socket_t fd, return const(ubyte)[] buffer, long offset = 0) @safe { + return scheduler.write(fd, buffer, offset).toSenderObject(); + } + SenderObjectBase!(void) close(socket_t fd) @safe { + return scheduler.close(fd).toSenderObject(); + } +} + +IOSchedulerObjectBase toIOSchedulerObject(S)(S scheduler) { + return new IOSchedulerObject!(S)(scheduler); +} diff --git a/source/concurrency/operations/oncompletion.d b/source/concurrency/operations/oncompletion.d index 93c13a5..97780af 100644 --- a/source/concurrency/operations/oncompletion.d +++ b/source/concurrency/operations/oncompletion.d @@ -17,6 +17,7 @@ private struct OnCompletionReceiver(Value, SideEffect, Receiver) { Receiver receiver; SideEffect sideEffect; static if (is(Value == void)) + // TODO: mustn't this be nothrow? void setValue() @safe { sideEffect(); receiver.setValue(); diff --git a/source/concurrency/operations/withioscheduler.d b/source/concurrency/operations/withioscheduler.d new file mode 100644 index 0000000..8b53981 --- /dev/null +++ b/source/concurrency/operations/withioscheduler.d @@ -0,0 +1,52 @@ +module concurrency.operations.withioscheduler; + +import concurrency; +import concurrency.receiver; +import concurrency.sender; +import concurrency.stoptoken; +import concepts; +import std.traits; + +auto withIOScheduler(Sender, IOScheduler)(Sender sender, IOScheduler ioScheduler) { + return WithIOSchedulerSender!(Sender, IOScheduler)(sender, ioScheduler); +} + +private struct WithIOSchedulerReceiver(Receiver, Value, IOScheduler) { + Receiver receiver; + IOScheduler ioScheduler; + static if (is(Value == void)) { + void setValue() @safe { + receiver.setValue(); + } + } else { + void setValue(Value value) @safe { + receiver.setValue(value); + } + } + + void setDone() @safe nothrow { + receiver.setDone(); + } + + void setError(Throwable e) @safe nothrow { + receiver.setError(e); + } + + auto getIOScheduler() @safe nothrow { + return ioScheduler; + } + + mixin ForwardExtensionPoints!receiver; +} + +struct WithIOSchedulerSender(Sender, IOScheduler) if (models!(Sender, isSender)) { + alias Value = Sender.Value; + Sender sender; + IOScheduler ioScheduler; + auto connect(Receiver)(return Receiver receiver) @safe return scope { + alias R = WithIOSchedulerReceiver!(Receiver, Sender.Value, IOScheduler); + // ensure NRVO + auto op = sender.connect(R(receiver, ioScheduler)); + return op; + } +} diff --git a/source/concurrency/receiver.d b/source/concurrency/receiver.d index 4c7df1a..09ab76c 100644 --- a/source/concurrency/receiver.d +++ b/source/concurrency/receiver.d @@ -25,12 +25,19 @@ mixin template 
ForwardExtensionPoints(alias receiver) { auto getScheduler() nothrow @safe { return receiver.getScheduler(); } + + static if (__traits(hasMember, receiver, "getIOScheduler")) { + auto getIOScheduler() nothrow @safe { + return receiver.getIOScheduler(); + } + } } /// A polymorphic receiver of type T interface ReceiverObjectBase(T) { import concurrency.stoptoken : StopToken; import concurrency.scheduler : SchedulerObjectBase; + import concurrency.ioscheduler : IOSchedulerObjectBase; static assert(models!(ReceiverObjectBase!T, isReceiver)); static if (is(T == void)) void setValue() @safe; @@ -40,6 +47,7 @@ interface ReceiverObjectBase(T) { void setError(Throwable e) nothrow @safe; shared(StopToken) getStopToken() nothrow @safe; SchedulerObjectBase getScheduler() scope nothrow @safe; + IOSchedulerObjectBase getIOScheduler() scope nothrow @safe; } struct NullReceiver(T) { @@ -96,3 +104,12 @@ void setValueOrError(Receiver, T)(auto ref Receiver receiver, } } } + +void setErrno(Receiver)(ref Receiver receiver, string msg, int n) @safe nothrow { + import std.exception : ErrnoException; + try { + receiver.setError(new ErrnoException(msg, n)); + } catch (Exception e) { + receiver.setError(e); + } +} diff --git a/source/concurrency/scheduler.d b/source/concurrency/scheduler.d index 08292f6..d068b42 100644 --- a/source/concurrency/scheduler.d +++ b/source/concurrency/scheduler.d @@ -23,33 +23,6 @@ interface SchedulerObjectBase { SenderObjectBase!void scheduleAfter(Duration d) @safe; } - -// We can pull the LocalThreadExecutor (and its schedule/scheduleAfter) out into a specialized context. -// Just like we did with the iouring context - -// The interesting bit is that the syncWait algorithm then might be inferred as @nogc - -// The question remains how we would want to integrate these. -// With iouring we created a runner that would take a sender and would inject the scheduler and allow itself to steal the current thread. - -// That last part is important, we don't want to spawn a thread just to run timers, we can do it perfectly fine on the current thread. -// Same with iouring or other event loops. - -// That said, we can, if we want to, move the event loop to another thread. - -// The only thing we can't do is cross schedule timers from one thread to another. -// Well, that is not true, we can create two context objects that expose a Scheduler - - - - - - -// Guess we just have to write it and see.... - -// Dietmar Kuhl used a iocontext with a run function that allows running it on the current thread. -// In rant I had the iocontext's runner return a sender so you could await that. 
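The notes removed above describe the pattern this PR implements: the io_uring context exposes a `run` that wraps a sender, is allowed to steal the calling thread to drive the ring, and itself returns a sender that can simply be awaited. A usage sketch assembled from the pieces added elsewhere in this diff (`IOContext`, `readAsync`, `syncWait`); `fd` and `buffer` are hypothetical inputs:

```d
import std.socket : socket_t;
import concurrency;
import concurrency.io : IOContext, readAsync;

ubyte[] readSome(socket_t fd, ubyte[] buffer) @safe {
    auto io = IOContext.construct(256);
    // run() injects the io_uring scheduler/IO-scheduler into the receiver chain and
    // drives the ring on the current thread until the wrapped sender completes.
    return io.run(readAsync(fd, buffer)).syncWait().value;
}
```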
- class SchedulerObject(S) : SchedulerObjectBase { import concurrency.sender : toSenderObject; S scheduler; @@ -77,10 +50,13 @@ enum TimerTrigger { cancel } -alias TimerDelegate = void delegate(TimerTrigger) @safe shared; +alias TimerDelegate = void delegate(TimerTrigger) @safe shared nothrow; import concurrency.timingwheels : ListElement; +static import concurrency.timingwheels; alias Timer = ListElement!(TimerDelegate); +alias TimingWheels = concurrency.timingwheels.TimingWheels!(TimerDelegate); +public import concurrency.timingwheels : TimerCommand; auto localThreadScheduler() { import concurrency.thread : LocalThreadWorker, getLocalThreadExecutor; @@ -179,7 +155,7 @@ struct ScheduleAfterOp(Worker, Receiver) { stopCb.register(token, cast(void delegate() nothrow @safe shared) &stop); try { - timer.userdata = cast(void delegate(TimerTrigger) @safe shared) &trigger; + timer.userdata = cast(void delegate(TimerTrigger) @safe shared nothrow) &trigger; worker.addTimer(timer, dur); } catch (Exception e) { receiver.setError(e); @@ -254,14 +230,13 @@ struct ManualTimeScheduler { } class ManualTimeWorker { - import concurrency.timingwheels : TimingWheels; import concurrency.executor : VoidDelegate; import core.sync.mutex : Mutex; import core.sync.condition : Condition; import core.time : msecs, hnsecs; import std.array : Appender; private { - TimingWheels!TimerDelegate wheels; + TimingWheels wheels; Condition condition; size_t time = 1; shared ulong nextTimerId; diff --git a/source/concurrency/sender.d b/source/concurrency/sender.d index 0c12b90..d556c7c 100644 --- a/source/concurrency/sender.d +++ b/source/concurrency/sender.d @@ -42,7 +42,7 @@ import core.time : Duration; /// checks that T is a Sender void checkSender(T)() @safe { - import concurrency.scheduler : SchedulerObjectBase; + import concurrency.ioscheduler : NullIOScheduler; import concurrency.stoptoken : StopToken; T t = T.init; struct Scheduler { @@ -75,6 +75,10 @@ void checkSender(T)() @safe { Scheduler getScheduler() @safe nothrow { return Scheduler.init; } + + NullIOScheduler getIOScheduler() @safe nothrow { + return NullIOScheduler("Testing NullIOScheduler"); + } } scope receiver = Receiver.init; @@ -225,10 +229,12 @@ template toReceiverObject(T) { import concurrency.receiver; import concurrency.stoptoken : StopToken; import concurrency.scheduler : SchedulerObjectBase; + import concurrency.ioscheduler : IOSchedulerObjectBase; return new class(receiver) ReceiverObjectBase!T { Receiver receiver; SchedulerObjectBase scheduler; + IOSchedulerObjectBase ioScheduler; this(Receiver receiver) { this.receiver = receiver; } @@ -262,6 +268,21 @@ template toReceiverObject(T) { } return scheduler; } + + IOSchedulerObjectBase getIOScheduler() nothrow @safe scope { + import concurrency.ioscheduler : toIOSchedulerObject; + static if (__traits(hasMember, receiver, "getIOScheduler")) { + if (ioScheduler is null) { + ioScheduler = receiver.getIOScheduler().toIOSchedulerObject; + } + } else { + import concurrency.ioscheduler : NullIOScheduler; + if (ioScheduler is null) { + ioScheduler = NullIOScheduler("Type " ~Receiver.stringof ~ " doesn't have IOScheduler.").toIOSchedulerObject; + } + } + return ioScheduler; + } }; } } diff --git a/source/concurrency/thread.d b/source/concurrency/thread.d index dbe5948..5e57330 100644 --- a/source/concurrency/thread.d +++ b/source/concurrency/thread.d @@ -84,7 +84,6 @@ class LocalThreadExecutor : Executor { import core.thread : ThreadID; import std.process : thisThreadID; import concurrency.scheduler : Timer; - 
import concurrency.timingwheels; static struct Node { VoidDelegate dg; @@ -94,7 +93,7 @@ class LocalThreadExecutor : Executor { private { ThreadID threadId; WorkQueue queue; - TimingWheels!TimerDelegate wheels; + TimingWheels wheels; shared ulong nextTimerId; } diff --git a/source/concurrency/timingwheels.d b/source/concurrency/timingwheels.d index bd37dda..7862a1d 100644 --- a/source/concurrency/timingwheels.d +++ b/source/concurrency/timingwheels.d @@ -127,9 +127,15 @@ struct ListElement(T) { T userdata; ulong scheduled_at; ushort position = 0xffff; + TimerCommand command; ListElement!T* prev, next; } +enum TimerCommand : ushort { + Register = 1, + Cancel = 2, +} + struct TimingWheels(T) { import core.bitop : bsr; diff --git a/source/during/io_uring.d b/source/during/io_uring.d new file mode 100644 index 0000000..9ad0188 --- /dev/null +++ b/source/during/io_uring.d @@ -0,0 +1,1742 @@ +/** + * io_uring system api definitions. + * + * See: https://github.com/torvalds/linux/blob/master/include/uapi/linux/io_uring.h + * + * Last changes from: bdb2c48e4b38e6dbe82533b437468999ba3ae498 (20220708) + */ +module during.io_uring; + +version (linux): + +import core.sys.posix.poll; +import core.sys.posix.signal; + +nothrow @nogc: + +/** + * IO operation submission data structure (Submission queue entry). + * + * C API: `struct io_uring_sqe` + */ +struct SubmissionEntry +{ + Operation opcode; /// type of operation for this sqe + SubmissionEntryFlags flags; /// IOSQE_ flags + ushort ioprio; /// ioprio for the request + int fd; /// file descriptor to do IO on + union + { + ulong off; /// offset into file + ulong addr2; /// from Linux 5.5 + + struct + { + uint cmd_op; /// from Linux 5.19 + uint __pad1; + } + } + + union + { + ulong addr; /// pointer to buffer or iovecs + ulong splice_off_in; + } + uint len; /// buffer size or number of iovecs + + union + { + ReadWriteFlags rw_flags; + FsyncFlags fsync_flags; + ushort poll_events; /// Unused from 5.9, kept for compatibility reasons - see https://github.com/torvalds/linux/commit/5769a351b89cd4d82016f18fa5f6c4077403564d + PollEvents poll_events32; /// from Linux 5.9 - word-reversed for BE + SyncFileRangeFlags sync_range_flags; /// from Linux 5.2 + MsgFlags msg_flags; /// from Linux 5.3 + TimeoutFlags timeout_flags; /// from Linux 5.4 + AcceptFlags accept_flags; /// from Linux 5.5 + CancelFlags cancel_flags; /// from Linux 5.5 + uint open_flags; /// from Linux 5.6 + uint statx_flags; /// from Linux 5.6 + uint fadvise_advice; /// from Linux 5.6 + uint splice_flags; /// from Linux 5.7 + uint rename_flags; /// from Linux 5.11 + uint unlink_flags; /// from Linux 5.11 + uint hardlink_flags; /// from Linux 5.15 + uint xattr_flags; /// from Linux 5.19 + uint msg_ring_flags; /// from Linux 6.0 + } + + ulong user_data; /// data to be passed back at completion time + + union + { + align (1): + ushort buf_index; /// index into fixed buffers, if used + ushort buf_group; /// for grouped buffer selection + } + + ushort personality; /// personality to use, if used + union + { + int splice_fd_in; + uint file_index; + } + + union + { + struct + { + ulong addr3; + ulong[1] __pad2; + } + /* + * If the ring is initialized with `IORING_SETUP_SQE128`, then + * this field is used for 80 bytes of arbitrary command data + */ + ubyte[0] cmd; + } + + /// Resets entry fields + void clear() @safe nothrow @nogc + { + this = SubmissionEntry.init; + } +} + +/* + * If sqe->file_index is set to this for opcodes that instantiate a new direct descriptor (like + * openat/openat2/accept), then 
io_uring will allocate an available direct descriptor instead of + * having the application pass one in. The picked direct descriptor will be returned in cqe->res, or + * `-ENFILE` if the space is full. + * + * Note: since Linux 5.19 + */ +enum IORING_FILE_INDEX_ALLOC = ~0U; + +enum ReadWriteFlags : int +{ + NONE = 0, + + /// High priority read/write. Allows block-based filesystems to + /// use polling of the device, which provides lower latency, but + /// may use additional resources. (Currently, this feature is + /// usable only on a file descriptor opened using the + /// O_DIRECT flag.) + /// + /// (since Linux 4.6) + HIPRI = 0x00000001, + + /// Provide a per-write equivalent of the O_DSYNC open(2) flag. + /// This flag is meaningful only for pwritev2(), and its effect + /// applies only to the data range written by the system call. + /// + /// (since Linux 4.7) + DSYNC = 0x00000002, + + /// Provide a per-write equivalent of the O_SYNC open(2) flag. + /// This flag is meaningful only for pwritev2(), and its effect + /// applies only to the data range written by the system call. + /// + /// (since Linux 4.7) + SYNC = 0x00000004, + + /// Do not wait for data which is not immediately available. If + /// this flag is specified, the preadv2() system call will + /// return instantly if it would have to read data from the + /// backing storage or wait for a lock. If some data was + /// successfully read, it will return the number of bytes read. + /// If no bytes were read, it will return -1 and set errno to + /// EAGAIN. Currently, this flag is meaningful only for + /// preadv2(). + /// + /// (since Linux 4.14) + NOWAIT = 0x00000008, + + /// Provide a per-write equivalent of the O_APPEND open(2) flag. + /// This flag is meaningful only for pwritev2(), and its effect + /// applies only to the data range written by the system call. + /// The offset argument does not affect the write operation; the + /// data is always appended to the end of the file. However, if + /// the offset argument is -1, the current file offset is + /// updated. + /// + /// (since Linux 4.16) + APPEND = 0x00000010 +} + +enum FsyncFlags : uint +{ + /// Normal file integrity sync + NORMAL = 0, + + /** + * `fdatasync` semantics. + * + * See_Also: `fsync(2)` for details + */ + DATASYNC = (1 << 0) +} + +/** Possible poll event flags. + * See: poll(2) + */ +enum PollEvents : uint +{ + NONE = 0, + + /// There is data to read. + IN = POLLIN, + + /** Writing is now possible, though a write larger that the available + * space in a socket or pipe will still block (unless O_NONBLOCK is set). + */ + OUT = POLLOUT, + + /** There is some exceptional condition on the file descriptor. + * Possibilities include: + * + * * There is out-of-band data on a TCP socket (see tcp(7)). + * * A pseudoterminal master in packet mode has seen a state + * change on the slave (see ioctl_tty(2)). + * * A cgroup.events file has been modified (see cgroups(7)). + */ + PRI = POLLPRI, + + /** Error condition (only returned in revents; ignored in events). + * This bit is also set for a file descriptor referring to the + * write end of a pipe when the read end has been closed. + */ + ERR = POLLERR, + + /// Invalid request: fd not open (only returned in revents; ignored in events). + NVAL = POLLNVAL, + + RDNORM = POLLRDNORM, /// Equivalent to POLLIN. + RDBAND = POLLRDBAND, /// Priority band data can be read (generally unused on Linux). + WRNORM = POLLWRNORM, /// Equivalent to POLLOUT. + WRBAND = POLLWRBAND, /// Priority data may be written. 
+ + /** Hang up (only returned in revents; ignored in events). Note + * that when reading from a channel such as a pipe or a stream + * socket, this event merely indicates that the peer closed its + * end of the channel. Subsequent reads from the channel will + * return 0 (end of file) only after all outstanding data in the + * channel has been consumed. + */ + HUP = POLLHUP, + + /** (since Linux 2.6.17) + * Stream socket peer closed connection, or shut down writing half of connection. + */ + RDHUP = 0x2000, + + /** (since Linux 4.5) + * Sets an exclusive wakeup mode for the epoll file descriptor that is being attached to the + * target file descriptor, fd. When a wakeup event occurs and multiple epoll file descriptors + * are attached to the same target file using EPOLLEXCLUSIVE, one or more of the epoll file + * descriptors will receive an event with epoll_wait(2). The default in this scenario (when + * EPOLLEXCLUSIVE is not set) is for all epoll file descriptors to receive an event. + * EPOLLEXCLUSIVE is thus useful for avoiding thundering herd problems in certain scenarios. + */ + EXCLUSIVE = 0x10000000, +} + +/** + * Flags for `sync_file_range(2)` operation. + * + * See_Also: `sync_file_range(2)` for details + */ +enum SyncFileRangeFlags : uint +{ + NOOP = 0, /// no operation + /// Wait upon write-out of all pages in the specified range that have already been submitted to + /// the device driver for write-out before performing any write. + WAIT_BEFORE = 1U << 0, + + /// Initiate write-out of all dirty pages in the specified range which are not presently + /// submitted write-out. Note that even this may block if you attempt to write more than + /// request queue size. + WRITE = 1U << 1, + + /// Wait upon write-out of all pages in the range after performing any write. + WAIT_AFTER = 1U << 2, + + /// This is a write-for-data-integrity operation that will ensure that all pages in the + /// specified range which were dirty when sync_file_range() was called are committed to disk. + WRITE_AND_WAIT = WAIT_BEFORE | WRITE | WAIT_AFTER +} + +/** + * Flags for `sendmsg(2)` and `recvmsg(2)` operations. + * + * See_Also: man pages for the operations. + */ +enum MsgFlags : uint +{ + /// No flags defined + NONE = 0, + + /// Sends out-of-band data on sockets that support this notion (e.g., of type `SOCK_STREAM`); the + /// underlying protocol must also support out-of-band data. + OOB = 0x01, + + /// This flag causes the receive operation to return data from the beginning of the receive + /// queue without removing that data from the queue. Thus, a subsequent receive call will return + /// the same data. + PEEK = 0x02, + + /// Don't use a gateway to send out the packet, send to hosts only on directly connected + /// networks. This is usually used only by diagnostic or routing programs. This is defined only + /// for protocol families that route; packet sockets don't. + DONTROUTE = 0x04, + + /// For raw (`AF_PACKET`), Internet datagram (since Linux 2.4.27/2.6.8), netlink (since Linux + /// 2.6.22), and UNIX datagram (since Linux 3.4) sockets: return the real length of the packet + /// or datagram, even when it was longer than the passed buffer. + /// + /// For use with Internet stream sockets, see `tcp(7)`. + TRUNC = 0x20, + + /// Enables nonblocking operation; if the operation would block, EAGAIN or EWOULDBLOCK is + /// returned. 
This provides similar behavior to setting the O_NONBLOCK flag (via the `fcntl(2)` + /// F_SETFL operation), but differs in that `MSG_DONTWAIT` is a per-call option, whereas + /// `O_NONBLOCK` is a setting on the open file description (see `open(2)`), which will affect + /// all threads in the calling process and as well as other processes that hold file descriptors + /// referring to the same open file description. + DONTWAIT = 0x40, + + /// Terminates a record (when this notion is supported, as for sockets of type `SOCK_SEQPACKET`). + EOR = 0x80, + + /// This flag requests that the operation block until the full request is satisfied. However, + /// the call may still return less data than requested if a signal is caught, an error or + /// disconnect occurs, or the next data to be received is of a different type than that + /// returned. This flag has no effect for datagram sockets. + WAITALL = 0x100, + + /// Tell the link layer that forward progress happened: you got a successful reply from the + /// other side. If the link layer doesn't get this it will regularly reprobe the neighbor (e.g., + /// via a unicast ARP). Valid only on SOCK_DGRAM and SOCK_RAW sockets and currently + /// implemented only for IPv4 and IPv6. See arp(7) for details. + CONFIRM = 0x800, + + /// This flag specifies that queued errors should be received from the socket error queue. The + /// error is passed in an ancillary message with a type dependent on the protocol (for IPv4 + /// `IP_RECVERR`). The user should supply a buffer of sufficient size. See `cmsg(3)` and `ip(7)` + /// for more information. The payload of the original packet that caused the error is passed as + /// normal data via msg_iovec. The original destination address of the datagram that caused the + /// error is supplied via `msg_name`. + ERRQUEUE = 0x2000, + + /// Don't generate a `SIGPIPE` signal if the peer on a stream-oriented socket has closed the + /// connection. The `EPIPE` error is still returned. This provides similar behavior to using + /// `sigaction(2)` to ignore `SIGPIPE`, but, whereas `MSG_NOSIGNAL` is a per-call feature, + /// ignoring `SIGPIPE` sets a process attribute that affects all threads in the process. + NOSIGNAL = 0x4000, + + /// The caller has more data to send. This flag is used with TCP sockets to obtain the same + /// effect as the `TCP_CORK` socket option (see `tcp(7)`), with the difference that this flag can be + /// set on a per-call basis. + /// + /// Since Linux 2.6, this flag is also supported for UDP sockets, and informs the kernel to + /// package all of the data sent in calls with this flag set into a single datagram which is + /// transmitted only when a call is performed that does not specify this flag. + /// + /// See_Also: the `UDP_CORK` socket option described in `udp(7)` + MORE = 0x8000, + + /// Set the close-on-exec flag for the file descriptor received via a UNIX domain file + /// descriptor using the `SCM_RIGHTS` operation (described in `unix(7)`). This flag is useful + /// for the same reasons as the `O_CLOEXEC` flag of `open(2)`. (recvmsg only) + CMSG_CLOEXEC = 0x40000000 +} + +/** sqe->timeout_flags + */ +enum TimeoutFlags : uint +{ + REL = 0, /// Relative time is the default + ABS = 1U << 0, /// Absolute time - `IORING_TIMEOUT_ABS` (from Linux 5.5) + + /** + * `IORING_TIMEOUT_UPDATE` (from Linux 5.11) + * + * Support timeout updates through `IORING_OP_TIMEOUT_REMOVE` with passed in `IORING_TIMEOUT_UPDATE`. 
+ */ + UPDATE = 1U << 1, + + /** + * `IORING_TIMEOUT_BOOTTIME` (from Linux 5.15) + */ + BOOTTIME = 1U << 2, + + /** + * `IORING_TIMEOUT_REALTIME` (from Linux 5.15) + */ + REALTIME = 1U << 3, + + /** + * `IORING_LINK_TIMEOUT_UPDATE` (from Linux 5.15) + */ + LINK_TIMEOUT_UPDATE = 1U << 4, + + /** + * `IORING_TIMEOUT_ETIME_SUCCESS` (from Linux 5.16) + */ + TIMEOUT_ETIME_SUCCESS = 1U << 5, + + /** + * `IORING_TIMEOUT_CLOCK_MASK` (from Linux 5.15) + */ + CLOCK_MASK = BOOTTIME | REALTIME, + + /** + * `IORING_TIMEOUT_UPDATE_MASK` (from Linux 5.15) + */ + UPDATE_MASK = UPDATE | LINK_TIMEOUT_UPDATE, +} + +/** + * sqe->splice_flags + * extends splice(2) flags + */ +enum SPLICE_F_FD_IN_FIXED = 1U << 31; /* the last bit of __u32 */ + +/** + * POLL_ADD flags + * + * Note that since sqe->poll_events is the flag space, the command flags for POLL_ADD are stored in + * sqe->len. + */ +enum PollFlags : uint +{ + NONE = 0, + + /** + * `IORING_POLL_ADD_MULTI` - Multishot poll. Sets `IORING_CQE_F_MORE` if the poll handler will + * continue to report CQEs on behalf of the same SQE. + * + * The default io_uring poll mode is one-shot, where once the event triggers, the poll command + * is completed and won't trigger any further events. If we're doing repeated polling on the + * same file or socket, then it can be more efficient to do multishot, where we keep triggering + * whenever the event becomes true. + * + * This deviates from the usual norm of having one CQE per SQE submitted. Add a CQE flag, + * IORING_CQE_F_MORE, which tells the application to expect further completion events from the + * submitted SQE. Right now the only user of this is POLL_ADD in multishot mode. + * + * An application should expect more CQEs for the specificed SQE if the CQE is flagged with + * IORING_CQE_F_MORE. In multishot mode, only cancelation or an error will terminate the poll + * request, in which case the flag will be cleared. + * + * Note: available from Linux 5.13 + */ + ADD_MULTI = 1U << 0, + + /** + * `IORING_POLL_UPDATE_EVENTS` + * + * Update existing poll request, matching sqe->addr as the old user_data field. + * + * Note: available from Linux 5.13 + */ + UPDATE_EVENTS = 1U << 1, + + /** + * `IORING_POLL_UPDATE_USER_DATA` + * + * Update existing poll request, matching sqe->addr as the old user_data field. + * + * Note: available from Linux 5.13 + */ + UPDATE_USER_DATA = 1U << 2, +} + +/** + * Flags that can be used with the `cancel` operation. + */ +enum CancelFlags : uint +{ + /// `IORING_ASYNC_CANCEL_ALL` (from linux 5.19) + /// Flag that allows to cancel any request that matches they key. It completes with the number + /// of requests found and canceled, or res < 0 if an error occured. + CANCEL_ALL = 1U << 0, + + /// `IORING_ASYNC_CANCEL_FD` (from linux 5.19) + /// Tells the kernel that we're keying off the file fd instead of `user_data` for cancelation. + /// This allows canceling any request that a) uses a file, and b) was assigned the file based on + /// the value being passed in. + CANCEL_FD = 1U << 1, + + /// `IORING_ASYNC_CANCEL_ANY` (from linux 5.19) + /// Rather than match on a specific key, be it user_data or file, allow canceling any request + /// that we can lookup. Works like IORING_ASYNC_CANCEL_ALL in that it cancels multiple requests, + /// but it doesn't key off user_data or the file. + /// + /// Can't be set with IORING_ASYNC_CANCEL_FD, as that's a key selector. Only one may be used at + /// the time. 
+ CANCEL_ANY = 1U << 2, +} + +// send/sendmsg and recv/recvmsg flags (sqe->ioprio) + +/// If set, instead of first attempting to send or receive and arm poll if that yields an `-EAGAIN` +/// result, arm poll upfront and skip the initial transfer attempt. +enum IORING_RECVSEND_POLL_FIRST = 1U << 0; + +/// Multishot recv. Sets IORING_CQE_F_MORE if the handler will continue to report CQEs on behalf of +/// the same SQE. +enum IORING_RECV_MULTISHOT = 1U << 1; + +/// Use registered buffers, the index is stored in the buf_index field. +enum IORING_RECVSEND_FIXED_BUF = 1U << 2; + +/// If set, SEND[MSG]_ZC should report the zerocopy usage in cqe.res for the IORING_CQE_F_NOTIF cqe. +/// 0 is reported if zerocopy was actually possible. IORING_NOTIF_USAGE_ZC_COPIED if data was copied +/// (at least partially). +enum IORING_SEND_ZC_REPORT_USAGE = 1U << 3; + +/// Accept flags stored in sqe->ioprio (since Linux 5.19) +enum IORING_ACCEPT_MULTISHOT = 1U << 0; + +/** + * Flags that can be used with the `accept4(2)` operation. + */ +enum AcceptFlags : uint +{ + /// Same as `accept()` + NONE = 0, + + /// Set the `O_NONBLOCK` file status flag on the new open file description. Using this flag saves + /// extra calls to `fcntl(2)` to achieve the same result. + NONBLOCK = 0x800, // octal 00004000 + + /// Set the close-on-exec (`FD_CLOEXEC`) flag on the new file descriptor. See the description of + /// the `O_CLOEXEC` flag in `open(2)` for reasons why this may be useful. + CLOEXEC = 0x80000 // octal 02000000 +} + +/** + * Describes the operation to be performed + * + * See_Also: `io_uring_enter(2)` + */ +enum Operation : ubyte +{ + // available from Linux 5.1 + NOP = 0, /// `IORING_OP_NOP` + READV = 1, /// `IORING_OP_READV` + WRITEV = 2, /// `IORING_OP_WRITEV` + FSYNC = 3, /// `IORING_OP_FSYNC` + READ_FIXED = 4, /// `IORING_OP_READ_FIXED` + WRITE_FIXED = 5, /// `IORING_OP_WRITE_FIXED` + POLL_ADD = 6, /// `IORING_OP_POLL_ADD` + POLL_REMOVE = 7, /// `IORING_OP_POLL_REMOVE` + + // available from Linux 5.2 + SYNC_FILE_RANGE = 8, /// `IORING_OP_SYNC_FILE_RANGE` + + // available from Linux 5.3 + SENDMSG = 9, /// `IORING_OP_SENDMSG` + RECVMSG = 10, /// `IORING_OP_RECVMSG` + + // available from Linux 5.4 + TIMEOUT = 11, /// `IORING_OP_TIMEOUT` + + // available from Linux 5.5 + TIMEOUT_REMOVE = 12, /// `IORING_OP_TIMEOUT_REMOVE` + ACCEPT = 13, /// `IORING_OP_ACCEPT` + ASYNC_CANCEL = 14, /// `IORING_OP_ASYNC_CANCEL` + LINK_TIMEOUT = 15, /// `IORING_OP_LINK_TIMEOUT` + CONNECT = 16, /// `IORING_OP_CONNECT` + + // available from Linux 5.6 + FALLOCATE = 17, /// `IORING_OP_FALLOCATE` + OPENAT = 18, /// `IORING_OP_OPENAT` + CLOSE = 19, /// `IORING_OP_CLOSE` + FILES_UPDATE = 20, /// `IORING_OP_FILES_UPDATE` + STATX = 21, /// `IORING_OP_STATX` + READ = 22, /// `IORING_OP_READ` + WRITE = 23, /// `IORING_OP_WRITE` + FADVISE = 24, /// `IORING_OP_FADVISE` + MADVISE = 25, /// `IORING_OP_MADVISE` + SEND = 26, /// `IORING_OP_SEND` + RECV = 27, /// `IORING_OP_RECV` + OPENAT2 = 28, /// `IORING_OP_OPENAT2` + EPOLL_CTL = 29, /// `IORING_OP_EPOLL_CTL` + + // available from Linux 5.7 + SPLICE = 30, /// `IORING_OP_SPLICE` + PROVIDE_BUFFERS = 31, /// `IORING_OP_PROVIDE_BUFFERS` + REMOVE_BUFFERS = 32, /// `IORING_OP_REMOVE_BUFFERS` + + // available from Linux 5.8 + TEE = 33, /// `IORING_OP_TEE` + + // available from Linux 5.11 + SHUTDOWN = 34, /// `IORING_OP_SHUTDOWN` + RENAMEAT = 35, /// `IORING_OP_RENAMEAT` - see renameat2() + UNLINKAT = 36, /// `IORING_OP_UNLINKAT` - see unlinkat(2) + + // available from Linux 5.15 + MKDIRAT = 37, /// 
`IORING_OP_MKDIRAT` - see mkdirat(2) + SYMLINKAT = 38, /// `IORING_OP_SYMLINKAT` - see symlinkat(2) + LINKAT = 39, /// `IORING_OP_LINKAT` - see linkat(2) + + // available from Linux 5.18 + MSG_RING = 40, /// `IORING_OP_MSG_RING` - allows an SQE to signal another ring + + // available from Linux 5.19 + FSETXATTR = 41, /// `IORING_OP_FSETXATTR` - see setxattr(2) + SETXATTR = 42, /// `IORING_OP_SETXATTR` - see setxattr(2) + FGETXATTR = 43, /// `IORING_OP_FGETXATTR` - see getxattr(2) + GETXATTR = 44, /// `IORING_OP_GETXATTR` - see getxattr(2) + SOCKET = 45, /// `IORING_OP_SOCKET` - see socket(2) + URING_CMD = 46, /// `IORING_OP_URING_CMD` +} + +/// sqe->flags +enum SubmissionEntryFlags : ubyte +{ + NONE = 0, + + /// Use fixed fileset (`IOSQE_FIXED_FILE`) + /// + /// When this flag is specified, fd is an index into the files array registered with the + /// io_uring instance (see the `IORING_REGISTER_FILES` section of the io_uring_register(2) man + /// page). + FIXED_FILE = 1U << 0, + + /** + * `IOSQE_IO_DRAIN`: issue after inflight IO + * + * If a request is marked with `IO_DRAIN`, then previous commands must complete before this one + * is issued. Subsequent requests are not started until the drain has completed. + * + * Note: available from Linux 5.2 + */ + IO_DRAIN = 1U << 1, + + /** + * `IOSQE_IO_LINK` + * + * If set, the next SQE in the ring will depend on this SQE. A dependent SQE will not be started + * until the parent SQE has completed. If the parent SQE fails, then a dependent SQE will be + * failed without being started. Link chains can be arbitrarily long, the chain spans any new + * SQE that continues tohave the IOSQE_IO_LINK flag set. Once an SQE is encountered that does + * not have this flag set, that defines the end of the chain. This features allows to form + * dependencies between individual SQEs. + * + * Note: available from Linux 5.3 + */ + IO_LINK = 1U << 2, + + /** + * `IOSQE_IO_HARDLINK` - like LINK, but stronger + * + * Some commands will invariably end in a failure in the sense that the + * completion result will be less than zero. One such example is timeouts + * that don't have a completion count set, they will always complete with + * `-ETIME` unless cancelled. + * + * For linked commands, we sever links and fail the rest of the chain if + * the result is less than zero. Since we have commands where we know that + * will happen, add IOSQE_IO_HARDLINK as a stronger link that doesn't sever + * regardless of the completion result. Note that the link will still sever + * if we fail submitting the parent request, hard links are only resilient + * in the presence of completion results for requests that did submit + * correctly. + * + * Note: available from Linux 5.5 + */ + IO_HARDLINK = 1U << 3, + + /** + * `IOSQE_ASYNC` + * + * io_uring defaults to always doing inline submissions, if at all possible. But for larger + * copies, even if the data is fully cached, that can take a long time. Add an IOSQE_ASYNC flag + * that the application can set on the SQE - if set, it'll ensure that we always go async for + * those kinds of requests. + * + * Note: available from Linux 5.6 + */ + ASYNC = 1U << 4, /* always go async */ + + /** + * `IOSQE_BUFFER_SELECT` + * If a server process has tons of pending socket connections, generally it uses epoll to wait + * for activity. When the socket is ready for reading (or writing), the task can select a buffer + * and issue a recv/send on the given fd. 
+ * + * Now that we have fast (non-async thread) support, a task can have tons of pending reads or + * writes pending. But that means they need buffers to back that data, and if the number of + * connections is high enough, having them preallocated for all possible connections is + * unfeasible. + * + * With IORING_OP_PROVIDE_BUFFERS, an application can register buffers to use for any request. + * The request then sets IOSQE_BUFFER_SELECT in the sqe, and a given group ID in sqe->buf_group. + * When the fd becomes ready, a free buffer from the specified group is selected. If none are + * available, the request is terminated with -ENOBUFS. If successful, the CQE on completion will + * contain the buffer ID chosen in the cqe->flags member, encoded as: + * + * `(buffer_id << IORING_CQE_BUFFER_SHIFT) | IORING_CQE_F_BUFFER;` + * + * Once a buffer has been consumed by a request, it is no longer available and must be + * registered again with IORING_OP_PROVIDE_BUFFERS. + * + * Requests need to support this feature. For now, IORING_OP_READ and IORING_OP_RECV support it. + * This is checked on SQE submission, a CQE with res == -EOPNOTSUPP will be posted if attempted + * on unsupported requests. + * + * Note: available from Linux 5.7 + */ + BUFFER_SELECT = 1U << 5, /* select buffer from sqe->buf_group */ + + /** + * `IOSQE_CQE_SKIP_SUCCESS` - don't post CQE if request succeeded. + * + * Emitting a CQE is expensive from the kernel perspective. Often, it's also not convenient for + * the userspace, spends some cycles on processing and just complicates the logic. A similar + * problems goes for linked requests, where we post an CQE for each request in the link. + * + * Introduce a new flags, IOSQE_CQE_SKIP_SUCCESS, trying to help with it. When set and a request + * completed successfully, it won't generate a CQE. When fails, it produces an CQE, but all + * following linked requests will be CQE-less, regardless whether they have + * IOSQE_CQE_SKIP_SUCCESS or not. The notion of "fail" is the same as for link + * failing-cancellation, where it's opcode dependent, and _usually_ result >= 0 is a success, + * but not always. + * + * Linked timeouts are a bit special. When the requests it's linked to was not attempted to be + * executed, e.g. failing linked requests, it follows the description above. Otherwise, whether + * a linked timeout will post a completion or not solely depends on IOSQE_CQE_SKIP_SUCCESS of + * that linked timeout request. Linked timeout never "fail" during execution, so for them it's + * unconditional. It's expected for users to not really care about the result of it but rely + * solely on the result of the master request. Another reason for such a treatment is that it's + * racy, and the timeout callback may be running awhile the master request posts its completion. + * + * use case 1: If one doesn't care about results of some requests, e.g. normal timeouts, just + * set IOSQE_CQE_SKIP_SUCCESS. Error result will still be posted and need to be handled. + * + * use case 2: Set IOSQE_CQE_SKIP_SUCCESS for all requests of a link but the last, and it'll + * post a completion only for the last one if everything goes right, otherwise there will be one + * only one CQE for the first failed request. 
+ * + * Note: available from Linux 5.17 + */ + CQE_SKIP_SUCCESS = 1U << 6, +} + +/** + * IO completion data structure (Completion Queue Entry) + * + * C API: `struct io_uring_cqe` + */ +struct CompletionEntry +{ + ulong user_data; /** sqe->data submission passed back */ + int res; /** result code for this event */ + CQEFlags flags; + + /* + * If the ring is initialized with `IORING_SETUP_CQE32`, then this field contains 16-bytes of + * padding, doubling the size of the CQE. + */ + ulong[0] big_cqe; +} + +/// Flags used with `CompletionEntry` +enum CQEFlags : uint +{ + NONE = 0, /// No flags set + + /// `IORING_CQE_F_BUFFER` (from Linux 5.7) + /// If set, the upper 16 bits are the buffer ID + BUFFER = 1U << 0, + + /// `IORING_CQE_F_MORE` (from Linux 5.13) + /// If set, parent SQE will generate more CQE entries + MORE = 1U << 1, + + /// `IORING_CQE_F_SOCK_NONEMPTY` (from Linux 5.19) + /// If set, more data to read after socket recv. + SOCK_NONEMPTY = 1U << 2, +} + +enum { + CQE_BUFFER_SHIFT = 16, /// Note: available from Linux 5.7 +} + +/** + * Passed in for io_uring_setup(2). Copied back with updated info on success. + * + * C API: `struct io_uring_params` + */ +struct SetupParameters +{ + // Magic offsets for the application to mmap the data it needs + + /// `IORING_OFF_SQ_RING`: mmap offset for submission queue ring + enum ulong SUBMISSION_QUEUE_RING_OFFSET = 0UL; + /// `IORING_OFF_CQ_RING`: mmap offset for completion queue ring + enum ulong COMPLETION_QUEUE_RING_OFFSET = 0x8000000UL; + /// `IORING_OFF_SQES`: mmap offset for submission entries + enum ulong SUBMISSION_QUEUE_ENTRIES_OFFSET = 0x10000000UL; + + /// (output) allocated entries in submission queue + /// (both ring index `array` and separate entry array at `SUBMISSION_QUEUE_ENTRIES_OFFSET`). + uint sq_entries; + + /// (output) allocated entries in completion queue + uint cq_entries; + + SetupFlags flags; /// (input) + + /// (input) used if SQ_AFF and SQPOLL flags are active to pin poll thread to specific cpu. + /// right now always checked in kernel for "possible cpu". + uint sq_thread_cpu; + + /// (input) used if SQPOLL flag is active; timeout in milliseconds + /// until kernel poll thread goes to sleep. + uint sq_thread_idle; + SetupFeatures features; /// (from Linux 5.4) + uint wq_fd; /// (from Linux 5.6) + private uint[3] resv; // reserved + SubmissionQueueRingOffsets sq_off; /// (output) submission queue ring data field offsets + CompletionQueueRingOffsets cq_off; /// (output) completion queue ring data field offsets +} + +/// `io_uring_setup()` flags +enum SetupFlags : uint +{ + /// No flags set + NONE = 0, + + /** + * `IORING_SETUP_IOPOLL` + * + * Perform busy-waiting for an I/O completion, as opposed to getting notifications via an + * asynchronous IRQ (Interrupt Request). The file system (if any) and block device must + * support polling in order for this to work. Busy-waiting provides lower latency, but may + * consume more CPU resources than interrupt driven I/O. Currently, this feature is usable + * only on a file descriptor opened using the O_DIRECT flag. When a read or write is submitted + * to a polled context, the application must poll for completions on the CQ ring by calling + * io_uring_enter(2). It is illegal to mix and match polled and non-polled I/O on an io_uring + * instance. + */ + IOPOLL = 1U << 0, + + /** + * `IORING_SETUP_SQPOLL` + * + * When this flag is specified, a kernel thread is created to perform submission queue polling. 
+ * An io_uring instance configured in this way enables an application to issue I/O without ever + * context switching into the kernel. + * By using the submission queue to fill in new submission queue entries and watching for + * completions on the completion queue, the application can submit and reap I/Os without doing + * a single system call. + * If the kernel thread is idle for more than sq_thread_idle microseconds, it will set the + * IORING_SQ_NEED_WAKEUP bit in the flags field of the struct io_sq_ring. When this happens, + * the application must call io_uring_enter(2) to wake the kernel thread. If I/O is kept busy, + * the kernel thread will never sleep. An application making use of this feature will need to + * guard the io_uring_enter(2) call with the following code sequence: + * + * ```` + * // Ensure that the wakeup flag is read after the tail pointer has been written. + * smp_mb(); + * if (*sq_ring->flags & IORING_SQ_NEED_WAKEUP) + * io_uring_enter(fd, 0, 0, IORING_ENTER_SQ_WAKEUP); + * ``` + * + * where sq_ring is a submission queue ring setup using the struct io_sqring_offsets described below. + * + * To successfully use this feature, the application must register a set of files to be used for + * IO through io_uring_register(2) using the IORING_REGISTER_FILES opcode. Failure to do so will + * result in submitted IO being errored with EBADF. + */ + SQPOLL = 1U << 1, + + /** + * `IORING_SETUP_SQ_AFF` + * + * If this flag is specified, then the poll thread will be bound to the cpu set in the + * sq_thread_cpu field of the struct io_uring_params. This flag is only meaningful when + * IORING_SETUP_SQPOLL is specified. + */ + SQ_AFF = 1U << 2, + + /** + * `IORING_SETUP_CQSIZE` + * + * Create the completion queue with struct io_uring_params.cq_entries entries. The value must + * be greater than entries, and may be rounded up to the next power-of-two. + * + * Note: Available from Linux 5.5 + */ + CQSIZE = 1U << 3, + + /** + * `IORING_SETUP_CLAMP` + * + * Some applications like to start small in terms of ring size, and then ramp up as needed. This + * is a bit tricky to do currently, since we don't advertise the max ring size. + * + * This adds IORING_SETUP_CLAMP. If set, and the values for SQ or CQ ring size exceed what we + * support, then clamp them at the max values instead of returning -EINVAL. Since we return the + * chosen ring sizes after setup, no further changes are needed on the application side. + * io_uring already changes the ring sizes if the application doesn't ask for power-of-two + * sizes, for example. + * + * Note: Available from Linux 5.6 + */ + CLAMP = 1U << 4, /* clamp SQ/CQ ring sizes */ + + /** + * `IORING_SETUP_ATTACH_WQ` + * + * If IORING_SETUP_ATTACH_WQ is set, it expects wq_fd in io_uring_params to be a valid io_uring + * fd io-wq of which will be shared with the newly created io_uring instance. If the flag is set + * but it can't share io-wq, it fails. + * + * This allows creation of "sibling" io_urings, where we prefer to keep the SQ/CQ private, but + * want to share the async backend to minimize the amount of overhead associated with having + * multiple rings that belong to the same backend. + * + * Note: Available from Linux 5.6 + */ + ATTACH_WQ = 1U << 5, /* attach to existing wq */ + + /** + * `IORING_SETUP_R_DISABLED` flag to start the rings disabled, allowing the user to register + * restrictions, buffers, files, before to start processing SQEs. 
+ * + * When `IORING_SETUP_R_DISABLED` is set, SQE are not processed and SQPOLL kthread is not started. + * + * The restrictions registration are allowed only when the rings are disable to prevent + * concurrency issue while processing SQEs. + * + * The rings can be enabled using `IORING_REGISTER_ENABLE_RINGS` opcode with io_uring_register(2). + * + * Note: Available from Linux 5.10 + */ + R_DISABLED = 1U << 6, /* start with ring disabled */ + + /** + * `IORING_SETUP_SUBMIT_ALL` + * + * Normally io_uring stops submitting a batch of request, if one of these + * requests results in an error. This can cause submission of less than + * what is expected, if a request ends in error while being submitted. If + * the ring is created with this flag, + * + * Note: Available from Linux 5.18 + */ + SUBMIT_ALL = 1U << 7, /* continue submit on error */ + + /** + * `IORING_SETUP_COOP_TASKRUN` + * + * By default, io_uring will interrupt a task running in userspace when a + * completion event comes in. This is to ensure that completions run in a timely + * manner. For a lot of use cases, this is overkill and can cause reduced + * performance from both the inter-processor interrupt used to do this, the + * kernel/user transition, the needless interruption of the tasks userspace + * activities, and reduced batching if completions come in at a rapid rate. Most + * applications don't need the forceful interruption, as the events are processed + * at any kernel/user transition. The exception are setups where the application + * uses multiple threads operating on the same ring, where the application + * waiting on completions isn't the one that submitted them. For most other + * use cases, setting this flag will improve performance. + * + * Note: Available since 5.19. + */ + COOP_TASKRUN = 1U << 8, + + /** + * `IORING_SETUP_TASKRUN_FLAG` + * + * If COOP_TASKRUN is set, get notified if task work is available for running and a kernel + * transition would be needed to run it. This sets IORING_SQ_TASKRUN in the sq ring flags. Not + * valid with COOP_TASKRUN. + * + * Note: Available since 5.19. + */ + TASKRUN_FLAG = 1U << 9, + + /// `IORING_SETUP_SQE128`: SQEs are 128 byte + /// Note: since Linux 5.19 + SQE128 = 1U << 10, + + /// `IORING_SETUP_CQE32`: CQEs are 32 byte + /// Note: since Linux 5.19 + CQE32 = 1U << 11, + + /// Only one task/thread is allowed to submit requests + /// + /// Note: Available since 6.1. + SINGLE_ISSUER = 1U << 12, + + /* + * Defer running task work to get events. + * Rather than running bits of task work whenever the task transitions + * try to do it just before it is needed. + * + * Note: Available since 6.1. + */ + DEFER_TASKRUN = 1U << 13, +} + +/// `io_uring_params->features` flags +enum SetupFeatures : uint +{ + NONE = 0, + + /** + * `IORING_FEAT_SINGLE_MMAP` (from Linux 5.4) + * + * Indicates that we can use single mmap feature to map both sq and cq rings and so to avoid the + * second mmap. + */ + SINGLE_MMAP = 1U << 0, + + /** + * `IORING_FEAT_NODROP` (from Linux 5.5) + * + * Currently we drop completion events, if the CQ ring is full. That's fine + * for requests with bounded completion times, but it may make it harder or + * impossible to use io_uring with networked IO where request completion + * times are generally unbounded. Or with POLL, for example, which is also + * unbounded. + * + * After this patch, we never overflow the ring, we simply store requests + * in a backlog for later flushing. This flushing is done automatically by + * the kernel. 
To prevent the backlog from growing indefinitely, if the + * backlog is non-empty, we apply back pressure on IO submissions. Any + * attempt to submit new IO with a non-empty backlog will get an -EBUSY + * return from the kernel. This is a signal to the application that it has + * backlogged CQ events, and that it must reap those before being allowed + * to submit more IO. + * + * Note that if we do return -EBUSY, we will have filled whatever + * backlogged events into the CQ ring first, if there's room. This means + * the application can safely reap events WITHOUT entering the kernel and + * waiting for them, they are already available in the CQ ring. + */ + NODROP = 1U << 1, + + /** + * `IORING_FEAT_SUBMIT_STABLE` (from Linux 5.5) + * + * If this flag is set, applications can be certain that any data for async offload has been + * consumed when the kernel has consumed the SQE. + */ + SUBMIT_STABLE = 1U << 2, + + /** + * `IORING_FEAT_RW_CUR_POS` (from Linux 5.6) + * + * If this flag is set, applications can know if setting `-1` as file offsets (meaning to work + * with current file position) is supported. + */ + RW_CUR_POS = 1U << 3, + + /** + * `IORING_FEAT_CUR_PERSONALITY` (from Linux 5.6) + * + * We currently setup the io_wq with a static set of mm and creds. Even for a single-use io-wq + * per io_uring, this is suboptimal as we have may have multiple enters of the ring. For + * sharing the io-wq backend, it doesn't work at all. + * + * Switch to passing in the creds and mm when the work item is setup. This means that async + * work is no longer deferred to the io_uring mm and creds, it is done with the current mm and + * creds. + * + * Flag this behavior with IORING_FEAT_CUR_PERSONALITY, so applications know they can rely on + * the current personality (mm and creds) being the same for direct issue and async issue. + */ + CUR_PERSONALITY = 1U << 4, + + /** + * `IORING_FEAT_FAST_POLL` (from Linux 5.7) + * + * Currently io_uring tries any request in a non-blocking manner, if it can, and then retries + * from a worker thread if we get -EAGAIN. Now that we have a new and fancy poll based retry + * backend, use that to retry requests if the file supports it. + * + * This means that, for example, an IORING_OP_RECVMSG on a socket no longer requires an async + * thread to complete the IO. If we get -EAGAIN reading from the socket in a non-blocking + * manner, we arm a poll handler for notification on when the socket becomes readable. When it + * does, the pending read is executed directly by the task again, through the io_uring task + * work handlers. Not only is this faster and more efficient, it also means we're not + * generating potentially tons of async threads that just sit and block, waiting for the IO to + * complete. + * + * The feature is marked with IORING_FEAT_FAST_POLL, meaning that async pollable IO is fast, + * and that pollother_op is fast as well. + */ + FAST_POLL = 1U << 5, + + /** + * `IORING_FEAT_POLL_32BITS` (from Linux 5.9) + * + * Poll events should be 32-bits to cover EPOLLEXCLUSIVE. + * Explicit word-swap the poll32_events for big endian to make sure the ABI is not changed. We + * call this feature IORING_FEAT_POLL_32BITS, applications who want to use EPOLLEXCLUSIVE should + * check the feature bit first. + */ + POLL_32BITS = 1U << 6, + + /** + * `IORING_FEAT_SQPOLL_NONFIXED` (from Linux 5.11) + * + * The restriction of needing fixed files for SQPOLL is problematic, and prevents/inhibits + * several valid uses cases. 
With the referenced files_struct that we have now, it's trivially + * supportable. + * + * Treat ->files like we do the mm for the SQPOLL thread - grab a reference to it (and assign + * it), and drop it when we're done. + * + * This feature is exposed as IORING_FEAT_SQPOLL_NONFIXED. + */ + SQPOLL_NONFIXED = 1U << 7, + + /** + * `IORING_FEAT_EXT_ARG` (from Linux 5.11) + * + * Supports adding timeout to `existing io_uring_enter()` + */ + EXT_ARG = 1U << 8, + + /// `IORING_FEAT_NATIVE_WORKERS` (from Linux 5.12) + NATIVE_WORKERS = 1U << 9, + + /// `IORING_FEAT_RSRC_TAGS` (from Linux 5.13) + RSRC_TAGS = 1U << 10, + + /// `IORING_FEAT_CQE_SKIP` (from Linux 5.17) + CQE_SKIP = 1U << 11, + + /// `IORING_FEAT_LINKED_FILE` (from Linux 5.18) + LINKED_FILE = 1U << 12, +} + +/** + * Filled with the offset for mmap(2) + * + * C API: `struct io_sqring_offsets` + */ +struct SubmissionQueueRingOffsets +{ + /// Incremented by kernel after entry at `head` was processed. + /// Pending submissions: [head..tail] + uint head; + + /// Modified by user space when new entry was queued; points to next + /// entry user space is going to fill. + uint tail; + + /// value `value_at(self.ring_entries) - 1` + /// mask for indices at `head` and `tail` (don't delete masked bits! + /// `head` and `tail` can point to the same entry, but if they are + /// not exactly equal it implies the ring is full, and if they are + /// exactly equal the ring is empty.) + uint ring_mask; + + /// value same as SetupParameters.sq_entries, power of 2. + uint ring_entries; + + /// SubmissionQueueFlags + SubmissionQueueFlags flags; + + /// number of (invalid) entries that were dropped; entries are + /// invalid if their index (in `array`) is out of bounds. + uint dropped; + + /// index into array of `SubmissionEntry`s at offset `SUBMISSION_QUEUE_ENTRIES_OFFSET` in mmap() + uint array; + + private uint[3] resv; // reserved +} + +enum SubmissionQueueFlags: uint +{ + NONE = 0, + + /// `IORING_SQ_NEED_WAKEUP`: needs io_uring_enter wakeup + /// set by kernel poll thread when it goes sleeping, and reset on wakeup + NEED_WAKEUP = 1U << 0, + + /// `IORING_SQ_CQ_OVERFLOW`: CQ ring is overflown + /// For those applications which are not willing to use io_uring_enter() to reap and handle + /// cqes, they may completely rely on liburing's io_uring_peek_cqe(), but if cq ring has + /// overflowed, currently because io_uring_peek_cqe() is not aware of this overflow, it won't + /// enter kernel to flush cqes. + /// To fix this issue, export cq overflow status to userspace by adding new + /// IORING_SQ_CQ_OVERFLOW flag, then helper functions() in liburing, such as io_uring_peek_cqe, + /// can be aware of this cq overflow and do flush accordingly. + /// + /// Note: Since Linux 5.8 + CQ_OVERFLOW = 1U << 1, + + /// `IORING_SQ_TASKRUN`: task should enter the kernel + /// If IORING_SETUP_COOP_TASKRUN is set to use cooperative scheduling for running task_work, + /// then IORING_SETUP_TASKRUN_FLAG can be set so the application can tell if task_work is + /// pending in the kernel for this ring. This allows use cases like io_uring_peek_cqe() to still + /// function appropriately, or for the task to know when it would be useful to call + /// io_uring_wait_cqe() to run pending events. + /// + /// Note: since Linux 5.19 + TASKRUN = 1U << 2, +} + +/** + * Field offsets used to map kernel structure to our. + * + * C API: `struct io_cqring_offsets` + */ +struct CompletionQueueRingOffsets +{ + /// incremented by user space after entry at `head` was processed. 
+ /// available entries for processing: [head..tail] + uint head; + + /// modified by kernel when new entry was created; points to next + /// entry kernel is going to fill. + uint tail; + + /// value `value_at(ring_entries) - 1` + /// mask for indices at `head` and `tail` (don't delete masked bits! + /// `head` and `tail` can point to the same entry, but if they are + /// not exactly equal it implies the ring is full, and if they are + /// exactly equal the ring is empty.) + uint ring_mask; + + /// value same as SetupParameters.cq_entries, power of 2. + uint ring_entries; + + /// incremented by the kernel every time it failed to queue a + /// completion event because the ring was full. + uint overflow; + + /// Offset to array of completion queue entries + uint cqes; + + CQRingFlags flags; /// (available from Linux 5.8) + private uint _resv1; + private ulong _resv2; +} + +/// CompletionQueue ring flags +enum CQRingFlags : uint +{ + NONE = 0, /// No flags set + + /// `IORING_CQ_EVENTFD_DISABLED` disable eventfd notifications (available from Linux 5.8) + /// This new flag should be set/clear from the application to disable/enable eventfd notifications when a request is completed and queued to the CQ ring. + /// + /// Before this patch, notifications were always sent if an eventfd is registered, so IORING_CQ_EVENTFD_DISABLED is not set during the initialization. + /// It will be up to the application to set the flag after initialization if no notifications are required at the beginning. + EVENTFD_DISABLED = 1U << 0, +} + +/// io_uring_register(2) opcodes and arguments +enum RegisterOpCode : uint +{ + /** + * `arg` points to a struct iovec array of nr_args entries. The buffers associated with the + * iovecs will be locked in memory and charged against the user's RLIMIT_MEMLOCK resource limit. + * See getrlimit(2) for more informa‐ tion. Additionally, there is a size limit of 1GiB per + * buffer. Currently, the buffers must be anonymous, non-file-backed memory, such as that + * returned by malloc(3) or mmap(2) with the MAP_ANONYMOUS flag set. It is expected that this + * limitation will be lifted in the future. Huge pages are supported as well. Note that the + * entire huge page will be pinned in the kernel, even if only a portion of it is used. + * + * After a successful call, the supplied buffers are mapped into the kernel and eligible for + * I/O. To make use of them, the application must specify the IORING_OP_READ_FIXED or + * IORING_OP_WRITE_FIXED opcodes in the submis‐ sion queue entry (see the struct io_uring_sqe + * definition in io_uring_enter(2)), and set the buf_index field to the desired buffer index. + * The memory range described by the submission queue entry's addr and len fields must fall + * within the indexed buffer. + * + * It is perfectly valid to setup a large buffer and then only use part of it for an I/O, as + * long as the range is within the originally mapped region. + * + * An application can increase or decrease the size or number of registered buffers by first + * unregistering the existing buffers, and then issuing a new call to io_uring_register() with + * the new buffers. + * + * An application need not unregister buffers explicitly before shutting down the io_uring + * instance. + * + * `IORING_REGISTER_BUFFERS` + */ + REGISTER_BUFFERS = 0, + + /** + * This operation takes no argument, and `arg` must be passed as NULL. All previously registered + * buffers associated with the io_uring instance will be released. 
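 *
 * A minimal sketch of the raw call; `ring` is assumed to be a descriptor returned by
 * `io_uring_setup(2)` (the higher level `Uring.unregisterBuffers` wrapper performs the same
 * call):
 *
 * ```
 * // release every buffer previously registered with REGISTER_BUFFERS
 * immutable ret = io_uring_register(ring, RegisterOpCode.UNREGISTER_BUFFERS, null, 0);
 * // on success 0 is returned; on error -1 is returned and errno is set
 * ```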
+ * + * `IORING_UNREGISTER_BUFFERS` + */ + UNREGISTER_BUFFERS = 1, + + /** + * Register files for I/O. `arg` contains a pointer to an array of `nr_args` file descriptors + * (signed 32 bit integers). + * + * To make use of the registered files, the IOSQE_FIXED_FILE flag must be set in the flags + * member of the struct io_uring_sqe, and the fd member is set to the index of the file in the + * file descriptor array. + * + * Files are automatically unregistered when the io_uring instance is torn down. An application + * need only unregister if it wishes to register a new set of fds. + * + * `IORING_REGISTER_FILES` + */ + REGISTER_FILES = 2, + + /** + * This operation requires no argument, and `arg` must be passed as NULL. All previously + * registered files associated with the io_uring instance will be unregistered. + * + * `IORING_UNREGISTER_FILES` + */ + UNREGISTER_FILES = 3, + + /** + * `IORING_REGISTER_EVENTFD` + * + * Registers eventfd that would be used to notify about completions on io_uring itself. + * + * Note: available from Linux 5.2 + */ + REGISTER_EVENTFD = 4, + + /** + * `IORING_UNREGISTER_EVENTFD` + * + * Unregisters previously registered eventfd. + * + * Note: available from Linux 5.2 + */ + UNREGISTER_EVENTFD = 5, + + /// `IORING_REGISTER_FILES_UPDATE` (from Linux 5.5) + REGISTER_FILES_UPDATE = 6, + + /** + * `IORING_REGISTER_EVENTFD_ASYNC` (from Linux 5.6) + * + * If an application is using eventfd notifications with poll to know when new SQEs can be + * issued, it's expecting the following read/writes to complete inline. And with that, it knows + * that there are events available, and don't want spurious wakeups on the eventfd for those + * requests. + * + * This adds IORING_REGISTER_EVENTFD_ASYNC, which works just like IORING_REGISTER_EVENTFD, + * except it only triggers notifications for events that happen from async completions (IRQ, or + * io-wq worker completions). Any completions inline from the submission itself will not + * trigger notifications. + */ + REGISTER_EVENTFD_ASYNC = 7, + + /** + * `IORING_REGISTER_PROBE` (from Linux 5.6) + * + * The application currently has no way of knowing if a given opcode is supported or not + * without having to try and issue one and see if we get -EINVAL or not. And even this approach + * is fraught with peril, as maybe we're getting -EINVAL due to some fields being missing, or + * maybe it's just not that easy to issue that particular command without doing some other leg + * work in terms of setup first. + * + * This adds IORING_REGISTER_PROBE, which fills in a structure with info on what it supported + * or not. This will work even with sparse opcode fields, which may happen in the future or + * even today if someone backports specific features to older kernels. + */ + REGISTER_PROBE = 8, + + /** + * `IORING_REGISTER_PERSONALITY` (from Linux 5.6) + * + * If an application wants to use a ring with different kinds of credentials, it can register + * them upfront. We don't lookup credentials, the credentials of the task calling + * IORING_REGISTER_PERSONALITY is used. + * + * An 'id' is returned for the application to use in subsequent personality support. + */ + REGISTER_PERSONALITY = 9, + + /// `IORING_UNREGISTER_PERSONALITY` (from Linux 5.6) + UNREGISTER_PERSONALITY = 10, + + /** + * `IORING_REGISTER_RESTRICTIONS` (from Linux 5.10) + * + * Permanently installs a feature allowlist on an io_ring_ctx. 
The io_ring_ctx can then be + * passed to untrusted code with the knowledge that only operations present in the allowlist can + * be executed. + * + * The allowlist approach ensures that new features added to io_uring do not accidentally become + * available when an existing application is launched on a newer kernel version. + * + * Currently it's possible to restrict sqe opcodes, sqe flags, and register opcodes. + * + * `IOURING_REGISTER_RESTRICTIONS` can only be made once. Afterwards it is not possible to + * change restrictions anymore. This prevents untrusted code from removing restrictions. + */ + REGISTER_RESTRICTIONS = 11, + + /** + *`IORING_REGISTER_ENABLE_RINGS` (from Linux 5.10) + * + * This operation is to be used when rings are disabled on start with `IORING_SETUP_R_DISABLED`. + */ + ENABLE_RINGS = 12, + + /** + * `IORING_REGISTER_FILES2` (from Linux 5.13) + */ + REGISTER_FILES2 = 13, + + /** + * `IORING_REGISTER_FILES_UPDATE2` (from Linux 5.13) + */ + REGISTER_FILES_UPDATE2 = 14, + + /** + * `IORING_REGISTER_BUFFERS2` (from Linux 5.13) + */ + REGISTER_BUFFERS2 = 15, + + /** + * `IORING_REGISTER_BUFFERS_UPDATE` (from Linux 5.13) + */ + REGISTER_BUFFERS_UPDATE = 16, + + /* set/clear io-wq thread affinities */ + /// `IORING_REGISTER_IOWQ_AFF` (from Linux 5.14) + REGISTER_IOWQ_AFF = 17, + + /// `IORING_UNREGISTER_IOWQ_AFF` (from Linux 5.14) + UNREGISTER_IOWQ_AFF = 18, + + /// `IORING_REGISTER_IOWQ_MAX_WORKERS` (from Linux 5.15) + /// set/get max number of io-wq workers + REGISTER_IOWQ_MAX_WORKERS = 19, + + /* register/unregister io_uring fd with the ring */ + /// `IORING_REGISTER_RING_FDS` (from Linux 5.18) + REGISTER_RING_FDS = 20, + + /// `IORING_UNREGISTER_RING_FDS` (from Linux 5.18) + UNREGISTER_RING_FDS = 21, + + /* register ring based provide buffer group */ + REGISTER_PBUF_RING = 22, /// `IORING_REGISTER_PBUF_RING` (from Linux 5.19) + UNREGISTER_PBUF_RING = 23, /// `IORING_UNREGISTER_PBUF_RING` (from Linux 5.19) +} + +/* io-wq worker categories */ +enum IOWQCategory +{ + BOUND, /// `IO_WQ_BOUND` + UNBOUND, /// `IO_WQ_UNBOUND` +} + +/// io_uring_enter(2) flags +enum EnterFlags: uint +{ + NONE = 0, + GETEVENTS = 1U << 0, /// `IORING_ENTER_GETEVENTS` + SQ_WAKEUP = 1U << 1, /// `IORING_ENTER_SQ_WAKEUP` + + /** + * `IORING_ENTER_SQ_WAIT` (from Linux 5.10) + * + * When using SQPOLL, applications can run into the issue of running out of SQ ring entries + * because the thread hasn't consumed them yet. The only option for dealing with that is + * checking later, or busy checking for the condition. + */ + SQ_WAIT = 1U << 2, + + /** + * `IORING_ENTER_EXT_ARG` (from Linux 5.11) + * + * Adds support for timeout to existing io_uring_enter() function. + */ + EXT_ARG = 1U << 3, + + /** + * `IORING_ENTER_REGISTERED_RING` (from Linux 5.18) + * + * Lots of workloads use multiple threads, in which case the file table is shared between them. + * This makes getting and putting the ring file descriptor for each io_uring_enter(2) system + * call more expensive, as it involves an atomic get and put for each call. + * + * Similarly to how we allow registering normal file descriptors to avoid this overhead, add + * support for an io_uring_register(2) API that allows to register the ring fds themselves. 
+ */ + ENTER_REGISTERED_RING = 1U << 4, +} + +/// Time specification as defined in kernel headers (used by TIMEOUT operations) +struct KernelTimespec +{ + long tv_sec; /// seconds + long tv_nsec; /// nanoseconds +} + +static assert(CompletionEntry.sizeof == 16); +static assert(CompletionQueueRingOffsets.sizeof == 40); +static assert(SetupParameters.sizeof == 120); +static assert(SubmissionEntry.sizeof == 64); +static assert(SubmissionQueueRingOffsets.sizeof == 40); + +/// Indicating that OP is supported by the kernel +enum IO_URING_OP_SUPPORTED = 1U << 0; + +/* + * Register a fully sparse file space, rather than pass in an array of all -1 file descriptors. + * + * Note: Available from Linux 5.19 + */ +enum IORING_RSRC_REGISTER_SPARSE = 1U << 0; + +/** + * Skip updating fd indexes set to this value in the fd table + * + * Support for skipping a file descriptor when using `IORING_REGISTER_FILES_UPDATE`. + * `__io_sqe_files_update` will skip fds set to `IORING_REGISTER_FILES_SKIP` + * + * Note: Available from Linux 5.12 + */ +enum IORING_REGISTER_FILES_SKIP = -2; + +struct io_uring_probe_op +{ + ubyte op; + ubyte resv; + ushort flags; /* IO_URING_OP_* flags */ + private uint resv2; +} + +static assert(io_uring_probe_op.sizeof == 8); + +struct io_uring_probe +{ + ubyte last_op; /* last opcode supported */ + ubyte ops_len; /* length of ops[] array below */ + private ushort resv; + private uint[3] resv2; + io_uring_probe_op[0] ops; +} + +static assert(io_uring_probe.sizeof == 16); + +struct io_uring_restriction +{ + RestrictionOp opcode; + union + { + ubyte register_op; /// IORING_RESTRICTION_REGISTER_OP + ubyte sqe_op; /// IORING_RESTRICTION_SQE_OP + ubyte sqe_flags; /// IORING_RESTRICTION_SQE_FLAGS_* + } + ubyte resv; + uint[3] resv2; +} + +struct io_uring_buf +{ + ulong addr; + uint len; + ushort bid; + ushort resv; +} + +struct io_uring_buf_ring +{ + union + { + /* + * To avoid spilling into more pages than we need to, the + * ring tail is overlaid with the io_uring_buf->resv field. + */ + struct + { + ulong resv1; + uint resv2; + ushort resv3; + ushort tail; + } + io_uring_buf[0] bufs; + } +} + +/* argument for IORING_(UN)REGISTER_PBUF_RING */ +struct io_uring_buf_reg +{ + ulong ring_addr; + uint ring_entries; + ushort bgid; + ushort pad; + ulong[3] resv; +} + +/** + * io_uring_restriction->opcode values + */ +enum RestrictionOp : ushort +{ + /// Allow an io_uring_register(2) opcode + IORING_RESTRICTION_REGISTER_OP = 0, + + /// Allow an sqe opcode + IORING_RESTRICTION_SQE_OP = 1, + + /// Allow sqe flags + IORING_RESTRICTION_SQE_FLAGS_ALLOWED = 2, + + /// Require sqe flags (these flags must be set on each submission) + IORING_RESTRICTION_SQE_FLAGS_REQUIRED = 3, +} + +struct io_uring_getevents_arg +{ + ulong sigmask; + uint sigmask_sz; + uint pad; + ulong ts; +} + +@system: + +/** + * Setup a context for performing asynchronous I/O. + * + * The `io_uring_setup()` system call sets up a submission queue (SQ) and completion queue (CQ) with + * at least entries entries, and returns a file descriptor which can be used to perform subsequent + * operations on the io_uring instance. The submission and completion queues are shared between + * userspace and the kernel, which eliminates the need to copy data when initiating and completing + * I/O. + * + * See_Also: `io_uring_setup(2)` + * + * Params: + * entries = Defines how many entries can submission queue hold. + * p = `SetupParameters` + * + * Returns: + * `io_uring_setup(2)` returns a new file descriptor on success. 
The application may then provide + * the file descriptor in a subsequent `mmap(2)` call to map the submission and completion queues, + * or to the `io_uring_register(2)` or `io_uring_enter(2)` system calls. + * + * On error, -1 is returned and `errno` is set appropriately. + */ +int io_uring_setup(uint entries, scope ref SetupParameters p) @trusted +{ + pragma(inline); + return syscall(SYS_io_uring_setup, entries, &p); +} + +/** + * Initiate and/or complete asynchronous I/O + * + * `io_uring_enter()` is used to initiate and complete I/O using the shared submission and + * completion queues setup by a call to `io_uring_setup(2)`. A single call can both submit new I/O + * and wait for completions of I/O initiated by this call or previous calls to `io_uring_enter()``. + * + * When the system call returns that a certain amount of SQEs have been consumed and submitted, it's + * safe to reuse SQE entries in the ring. This is true even if the actual IO submission had to be + * punted to async context, which means that the SQE may in fact not have been submitted yet. If the + * kernel requires later use of a particular SQE entry, it will have made a private copy of it. + * + * Note: For interrupt driven I/O (where `IORING_SETUP_IOPOLL` was not specified in the call to + * `io_uring_setup(2)`), an application may check the completion queue for event completions without + * entering the kernel at all. + * + * See_Also: `io_uring_enter(2)` + * + * Params: + * fd = the file descriptor returned by io_uring_setup(2). + * to_submit = specifies the number of I/Os to submit from the submission queue. + * min_complete = If the `IORING_ENTER_GETEVENTS` bit is set in flags, then the system call will attempt + * to wait for `min_complete` event completions before returning. If the io_uring instance was configured + * for polling, by specifying IORING_SETUP_IOPOLL in the call to io_uring_setup(2), then + * min_complete has a slightly different meaning. Passing a value of 0 instructs the kernel to + * return any events which are already complete, without blocking. If min_complete is a non-zero + * value, the kernel will still return immediately if any completion events are available. If + * no event completions are available, then the call will poll either until one or more + * completions become available, or until the process has exceeded its scheduler time slice. + * flags = Behavior modification flags - `EnterFlags` + * sig = a pointer to a signal mask (see `sigprocmask(2)`); if sig is not `null`, `io_uring_enter()` + * first replaces the current signal mask by the one pointed to by sig, then waits for events to + * become available in the completion queue, and then restores the original signal mask. The + * following `io_uring_enter()` call: + * + * ``` + * ret = io_uring_enter(fd, 0, 1, IORING_ENTER_GETEVENTS, &sig); + * ``` + * + * is equivalent to atomically executing the following calls: + * + * ``` + * pthread_sigmask(SIG_SETMASK, &sig, &orig); + * ret = io_uring_enter(fd, 0, 1, IORING_ENTER_GETEVENTS, NULL); + * pthread_sigmask(SIG_SETMASK, &orig, NULL); + * ``` + * + * See the description of `pselect(2)` for an explanation of why the sig parameter is necessary. 
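 *
 * For illustration, a plain submit-and-wait call through the D wrapper below (where `ring`
 * stands in for a descriptor obtained from `io_uring_setup(2)` and `queued` for the number of
 * prepared SQEs) could look like:
 *
 * ```
 * // submit `queued` entries and block until at least one of them completes
 * immutable ret = io_uring_enter(ring, queued, 1, EnterFlags.GETEVENTS);
 * // ret is the number of consumed SQEs, or -1 with errno set on failure
 * ```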
+ * + * Returns: + */ +int io_uring_enter(int fd, uint to_submit, uint min_complete, EnterFlags flags, const sigset_t* sig = null) +{ + pragma(inline); + return syscall(SYS_io_uring_enter, fd, to_submit, min_complete, flags, sig, sigset_t.sizeof); +} + +/// ditto +int io_uring_enter(int fd, uint to_submit, uint min_complete, EnterFlags flags, const io_uring_getevents_arg* args) +{ + pragma(inline); + return syscall(SYS_io_uring_enter, fd, to_submit, min_complete, flags, args, io_uring_getevents_arg.sizeof); +} + +/** + * Register files or user buffers for asynchronous I/O. + * + * The `io_uring_register()` system call registers user buffers or files for use in an `io_uring(7)` + * instance referenced by fd. Registering files or user buffers allows the kernel to take long term + * references to internal data structures or create long term mappings of application memory, + * greatly reducing per-I/O overhead. + * + * See_Also: `io_uring_register(2) + * + * Params: + * fd = the file descriptor returned by a call to io_uring_setup(2) + * opcode = code of operation to execute on args + * arg = Args used by specified operation. See `RegisterOpCode` for usage details. + * nr_args = number of provided arguments + * + * Returns: On success, io_uring_register() returns 0. On error, -1 is returned, and errno is set accordingly. + */ +int io_uring_register(int fd, RegisterOpCode opcode, const(void)* arg, uint nr_args) +{ + pragma(inline); + return syscall(SYS_io_uring_register, fd, opcode, arg, nr_args); +} + +private: + +// Syscalls +enum +{ + SYS_io_uring_setup = 425, + SYS_io_uring_enter = 426, + SYS_io_uring_register = 427 +} + +extern (C): + +/// Invoke `system call' number `sysno`, passing it the remaining arguments. +int syscall(int sysno, ...); \ No newline at end of file diff --git a/source/during/openat2.d b/source/during/openat2.d new file mode 100644 index 0000000..9ad97a2 --- /dev/null +++ b/source/during/openat2.d @@ -0,0 +1,30 @@ +module during.openat2; + +/** + * Arguments for how `openat2(2)` should open the target path. If only `flags` and `mode` are + * non-zero, then `openat2(2)`` operates very similarly to openat(2). + * + * However, unlike openat(2), unknown or invalid bits in @flags result in `-EINVAL` rather than + * being silently ignored. @mode must be zero unless one of `O_CREAT`, `O_TMPFILE` are set. + */ +struct OpenHow +{ + ulong flags; /// O_* flags + ulong mode; /// O_CREAT/O_TMPFILE file mode + Resolve resolve; /// Resolve flags +} + +/// Resolve flags +enum Resolve : ulong +{ + /// Block mount-point crossings (includes bind-mounts). + RESOLVE_NO_XDEV = 0x01, + /// Block traversal through procfs-style "magic-links". + RESOLVE_NO_MAGICLINKS = 0x02, + /// Block traversal through all symlinks (implies OEXT_NO_MAGICLINKS) + RESOLVE_NO_SYMLINKS = 0x04, + /// Block "lexical" trickery like "..", symlinks, and absolute paths which escape the dirfd. + RESOLVE_BENEATH = 0x08, + /// Make all jumps to "/" and ".." be scoped inside the dirfd (similar to chroot(2)). + RESOLVE_IN_ROOT = 0x10 +} \ No newline at end of file diff --git a/source/during/package.d b/source/during/package.d new file mode 100644 index 0000000..dd8a704 --- /dev/null +++ b/source/during/package.d @@ -0,0 +1,1981 @@ +/** + * Simple idiomatic dlang wrapper around linux io_uring + * (see: https://kernel.dk/io_uring.pdf) asynchronous API. 
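 *
 * A minimal end-to-end sketch (not a complete program: error handling is trimmed, `prepNop`
 * stands in for one of the `prepXX` helpers defined later in this module, and the entry field
 * names follow the underlying `io_uring_sqe`):
 *
 * ```
 * Uring io;
 * immutable rc = io.setup(16);      // create an instance with 16 submission queue entries
 * assert(rc == 0);
 *
 * // queue a single no-op request and tag it so the completion can be recognized
 * io.putWith!((ref SubmissionEntry e) { e.prepNop(); e.user_data = 42; })();
 *
 * immutable submitted = io.submitAndWait(1); // submit and wait for one completion
 * assert(submitted == 1);
 *
 * assert(!io.empty);
 * assert(io.front.user_data == 42); // CompletionEntry carries the tag back in user_data
 * io.popFront();                    // hand the CQE slot back to the kernel
 * ```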
+ */ +module during; + +version(linux): + +public import during.io_uring; +import during.openat2; + +import core.atomic : MemoryOrder; +debug import core.stdc.stdio; +import core.stdc.stdlib; +import core.sys.linux.epoll; +import core.sys.linux.errno; +import core.sys.linux.sched; +import core.sys.linux.sys.mman; +import core.sys.linux.unistd; +import core.sys.posix.fcntl; +import core.sys.posix.signal; +import core.sys.posix.sys.socket; +import core.sys.posix.sys.types; +import core.sys.posix.sys.uio; +import std.algorithm.comparison : among; +import std.traits : Unqual; + +nothrow @nogc: + +/** + * Setup new instance of io_uring into provided `Uring` structure. + * + * Params: + * uring = `Uring` structure to be initialized (must not be already initialized) + * entries = Number of entries to initialize uring with + * flags = `SetupFlags` to use to initialize uring. + * + * Returns: On succes it returns 0, `-errno` otherwise. + */ +int setup(ref Uring uring, uint entries = 128, SetupFlags flags = SetupFlags.NONE) @safe +{ + SetupParameters params; + params.flags = flags; + return setup(uring, entries, params); +} + +/** + * Setup new instance of io_uring into provided `Uring` structure. + * + * Params: + * uring = `Uring` structure to be initialized (must not be already initialized) + * entries = Number of entries to initialize uring with + * params = `SetupParameters` to use to initialize uring. + * + * Returns: On succes it returns 0, `-errno` otherwise. + */ +int setup(ref Uring uring, uint entries, ref const SetupParameters params) @safe +{ + assert(uring.payload is null, "Uring is already initialized"); + uring.payload = () @trusted { return cast(UringDesc*)calloc(1, UringDesc.sizeof); }(); + if (uring.payload is null) return -errno; + + uring.payload.params = params; + uring.payload.refs = 1; + auto r = io_uring_setup(entries, uring.payload.params); + if (r < 0) return -errno; + + uring.payload.fd = r; + + r = uring.payload.mapRings(); + if (r < 0) + { + dispose(uring); + return r; + } + + // debug printf("uring(%d): setup\n", uring.payload.fd); + + return 0; +} + +/** + * Simplified wrapper around `io_uring_probe` that is used to check what io_uring operations current + * kernel is actually supporting. + */ +struct Probe +{ + static assert (Operation.max < 64, "Needs to be adjusted"); + private + { + io_uring_probe probe; + io_uring_probe_op[64] ops; + int err; + } + + const @safe pure nothrow @nogc: + + /// Is operation supported? + bool isSupported(Operation op) + in (op <= Operation.max, "Invalid operation") + { + if (op > probe.last_op) return false; + assert(ops[op].op == op, "Operations differs"); + return (ops[op].flags & IO_URING_OP_SUPPORTED) != 0; + } + + /// Error code when we fail to get `Probe`. + @property int error() { return err; } + + /// `true` if probe was sucesfully retrieved. + T opCast(T)() if (is(T == bool)) { return err == 0; } +} + +/// Probes supported operations on a temporary created uring instance +Probe probe() @safe nothrow @nogc +{ + Uring io; + immutable ret = io.setup(2); + if (ret < 0) { + Probe res; + res.err = ret; + return res; + } + + return io.probe(); +} + +/** + * Main entry point to work with io_uring. + * + * It hides `SubmissionQueue` and `CompletionQueue` behind standard range interface. + * We put in `SubmissionEntry` entries and take out `CompletionEntry` entries. + * + * Use predefined `prepXX` methods to fill required fields of `SubmissionEntry` before `put` or during `putWith`. 
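 *
 * For illustration (a sketch only; `io` is an initialized `Uring` and `prepNop` stands in for
 * any of the `prepXX` helpers):
 *
 * ```
 * // putWith: the queue slot is cleared first and then filled by the callback
 * io.putWith!((ref SubmissionEntry e) { e.prepNop(); })();
 *
 * // put: a caller-owned entry is copied into the queue as-is
 * SubmissionEntry entry;
 * entry.prepNop();
 * io.put(entry);
 * ```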
+ * + * Note: `prepXX` functions doesn't touch previous entry state, just fills in operation properties. This is because for + * less error prone interface it is cleared automatically when prepared using `putWith`. So when using on own `SubmissionEntry` + * (outside submission queue), that would be added to the submission queue using `put`, be sure its cleared if it's + * reused for multiple operations. + */ +struct Uring +{ + nothrow @nogc: + + private UringDesc* payload; + + /// Copy constructor + this(ref return scope Uring rhs) @safe pure + { + assert(rhs.payload !is null, "rhs payload is null"); + // debug printf("uring(%d): copy\n", rhs.payload.fd); + this.payload = rhs.payload; + this.payload.refs++; + } + + /// Destructor + ~this() @safe + { + dispose(this); + } + + /// Probes supported operations + Probe probe() @safe + in (payload !is null, "Uring hasn't been initialized yet") + { + Probe res; + immutable ret = () @trusted { return io_uring_register( + payload.fd, + RegisterOpCode.REGISTER_PROBE, + cast(void*)&res.probe, res.ops.length + ); }(); + if (ret < 0) res.err = ret; + return res; + } + + /// Native io_uring file descriptor + int fd() const @safe pure + in (payload !is null, "Uring hasn't been initialized yet") + { + return payload.fd; + } + + /// io_uring parameters + SetupParameters params() const @safe pure return + in (payload !is null, "Uring hasn't been initialized yet") + { + return payload.params; + } + + /// Check if there is some `CompletionEntry` to process. + bool empty() const @safe pure + in (payload !is null, "Uring hasn't been initialized yet") + { + return payload.cq.empty; + } + + /// Check if there is space for another `SubmissionEntry` to submit. + bool full() const @safe pure + in (payload !is null, "Uring hasn't been initialized yet") + { + return payload.sq.full; + } + + /// Available space in submission queue before it becomes full + size_t capacity() const @safe pure + in (payload !is null, "Uring hasn't been initialized yet") + { + return payload.sq.capacity; + } + + /// Number of entries in completion queue + size_t length() const @safe pure + in (payload !is null, "Uring hasn't been initialized yet") + { + return payload.cq.length; + } + + /// Get first `CompletionEntry` from cq ring + ref CompletionEntry front() @safe pure return + in (payload !is null, "Uring hasn't been initialized yet") + { + return payload.cq.front; + } + + /// Move to next `CompletionEntry` + void popFront() @safe pure + in (payload !is null, "Uring hasn't been initialized yet") + { + return payload.cq.popFront; + } + + /** + * Adds new entry to the `SubmissionQueue`. + * + * Note that this just adds entry to the queue and doesn't advance the tail + * marker kernel sees. For that `finishSq()` is needed to be called next. + * + * Also note that to actually enter new entries to kernel, + * it's needed to call `submit()`. + * + * Params: + * FN = Function to fill next entry in queue by `ref` (should be faster). + * It is expected to be in a form of `void function(ARGS)(ref SubmissionEntry, auto ref ARGS)`. + * Note that in this case queue entry is cleaned first before function is called. + * entry = Custom built `SubmissionEntry` to be posted as is. + * Note that in this case it is copied whole over one in the `SubmissionQueue`. + * args = Optional arguments passed to the function + * + * Returns: reference to `Uring` structure so it's possible to chain multiple commands. 
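 *
 * For example, several requests can be queued and submitted in one expression (the entries
 * are assumed to have been prepared with the `prepXX` helpers):
 *
 * ```
 * io.put(first).put(second).submit(2); // queue both, submit, and wait for two completions
 * ```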
+ */ + ref Uring put()(auto ref SubmissionEntry entry) @safe pure return + in (payload !is null, "Uring hasn't been initialized yet") + { + payload.sq.put(entry); + return this; + } + + /// ditto + ref Uring putWith(alias FN, ARGS...)(auto ref ARGS args) return + in (payload !is null, "Uring hasn't been initialized yet") + { + import std.functional : forward; + payload.sq.putWith!FN(forward!args); + return this; + } + + /** + * Similar to `put(SubmissionEntry)` but in this case we can provide our custom type (args) to be filled + * to next `SubmissionEntry` in queue. + * + * Fields in the provided type must use the same names as in `SubmissionEntry` to be automagically copied. + * + * Params: + * op = Custom operation definition. + * Returns: + */ + ref Uring put(OP)(auto ref OP op) return + if (!is(OP == SubmissionEntry)) + in (payload !is null, "Uring hasn't been initialized yet") + { + payload.sq.put(op); + return this; + } + + /** + * Advances the userspace submision queue and returns last `SubmissionEntry`. + */ + ref SubmissionEntry next()() @safe pure + in (payload !is null, "Uring hasn't been initialized yet") + { + return payload.sq.next(); + } + + /** + * If completion queue is full, the new event maybe dropped. + * This value records number of dropped events. + */ + uint overflow() const @safe pure + in (payload !is null, "Uring hasn't been initialized yet") + { + return payload.cq.overflow; + } + + /// Counter of invalid submissions (out-of-bound index in submission array) + uint dropped() const @safe pure + in (payload !is null, "Uring hasn't been initialized yet") + { + return payload.sq.dropped; + } + + /** + * Submits qued `SubmissionEntry` to be processed by kernel. + * + * Params: + * want = number of `CompletionEntries` to wait for. + * If 0, this just submits queued entries and returns. + * If > 0, it blocks until at least wanted number of entries were completed. + * sig = See io_uring_enter(2) man page + * + * Returns: Number of submitted entries on success, `-errno` on error + */ + int submit(S)(uint want, const scope S* args) + if (is(S == sigset_t) || is(S == io_uring_getevents_arg)) + in (payload !is null, "Uring hasn't been initialized yet") + { + if (_expect(want > 0, false)) return submitAndWait(want, args); + return submit(args); + } + + /// ditto + int submit(uint want) @safe + { + pragma(inline, true) + if (_expect(want > 0, true)) return submitAndWait(want, cast(sigset_t*)null); + return submit(cast(sigset_t*)null); + } + + /// ditto + int submit(S)(const scope S* args) @trusted + if (is(S == sigset_t) || is(S == io_uring_getevents_arg)) + in (payload !is null, "Uring hasn't been initialized yet") + { + immutable len = cast(int)payload.sq.length; + if (_expect(len > 0, true)) // anything to submit? + { + payload.sq.flushTail(); // advance queue index + + EnterFlags flags; + if (payload.params.flags & SetupFlags.SQPOLL) + { + if (_expect(payload.sq.flags & SubmissionQueueFlags.NEED_WAKEUP, false)) + flags |= EnterFlags.SQ_WAKEUP; + else return len; // fast poll + } + static if (is(S == io_uring_getevents_arg)) + flags |= EnterFlags.EXT_ARG; + immutable r = io_uring_enter(payload.fd, len, 0, flags, args); + if (_expect(r < 0, false)) return -errno; + return r; + } + return 0; + } + + /// ditto + int submit() @safe + { + pragma(inline, true) + return submit(cast(sigset_t*)null); + } + + /** + * Flushes submission queue index to the kernel. + * Doesn't call any syscall, it just advances the SQE queue for kernel. 
+ * This can be used with `IORING_SETUP_SQPOLL` when kernel polls the submission queue. + */ + void flush() @safe + { + payload.sq.flushTail(); // advance queue index + } + + /** + * Simmilar to `submit` but with this method we just wait for required number + * of `CompletionEntries`. + * + * Returns: `0` on success, `-errno` on error + */ + int wait(S)(uint want = 1, const scope S* args = null) @trusted + if (is(S == sigset_t) || is(S == io_uring_getevents_arg)) + in (payload !is null, "Uring hasn't been initialized yet") + in (want > 0, "Invalid want value") + { + pragma(inline); + if (payload.cq.length >= want) return 0; // we don't need to syscall + EnterFlags flags = EnterFlags.GETEVENTS; + static if (is(S == io_uring_getevents_arg)) + flags |= EnterFlags.EXT_ARG; + immutable r = io_uring_enter(payload.fd, 0, want, flags, args); + if (_expect(r < 0, false)) return -errno; + return 0; + } + + /// ditto + int wait(uint want = 1) + { + pragma(inline, true) + return wait(want, cast(sigset_t*)null); + } + + /** + * Special case of a `submit` that can be used when we know beforehead that we want to wait for + * some amount of CQEs. + */ + int submitAndWait(S)(uint want, const scope S* args) @trusted + if (is(S == sigset_t) || is(S == io_uring_getevents_arg)) + in (payload !is null, "Uring hasn't been initialized yet") + in (want > 0, "Invalid want value") + { + immutable len = cast(int)payload.sq.length; + if (_expect(len > 0, true)) // anything to submit? + { + payload.sq.flushTail(); // advance queue index + + EnterFlags flags = EnterFlags.GETEVENTS; + if (payload.params.flags & SetupFlags.SQPOLL) + { + if (_expect(payload.sq.flags & SubmissionQueueFlags.NEED_WAKEUP, false)) + flags |= EnterFlags.SQ_WAKEUP; + } + static if (is(S == io_uring_getevents_arg)) + flags |= EnterFlags.EXT_ARG; + + immutable r = io_uring_enter(payload.fd, len, want, flags, args); + if (_expect(r < 0, false)) return -errno; + return r; + } + return wait(want); // just simple wait + } + + /// ditto + int submitAndWait(uint want) @safe + { + pragma(inline, true) + return submitAndWait(want, cast(sigset_t*)null); + } + + /** + * Register single buffer to be mapped into the kernel for faster buffered operations. + * + * To use the buffers, the application must specify the fixed variants for of operations, + * `READ_FIXED` or `WRITE_FIXED` in the `SubmissionEntry` also with used `buf_index` set + * in entry extra data. + * + * An application can increase or decrease the size or number of registered buffers by first + * unregistering the existing buffers, and then issuing a new call to io_uring_register() with + * the new buffers. + * + * Params: + * buffer = Buffers to be registered + * + * Returns: On success, returns 0. On error, `-errno` is returned. + */ + int registerBuffers(T)(T buffers) + if (is(T == ubyte[]) || is(T == ubyte[][])) // TODO: something else? 
+ in (payload !is null, "Uring hasn't been initialized yet") + in (buffers.length, "Empty buffer") + { + if (payload.regBuffers !is null) + return -EBUSY; // buffers were already registered + + static if (is(T == ubyte[])) + { + auto p = malloc(iovec.sizeof); + if (_expect(p is null, false)) return -errno; + payload.regBuffers = (cast(iovec*)p)[0..1]; + payload.regBuffers[0].iov_base = cast(void*)&buffers[0]; + payload.regBuffers[0].iov_len = buffers.length; + } + else static if (is(T == ubyte[][])) + { + auto p = malloc(buffers.length * iovec.sizeof); + if (_expect(p is null, false)) return -errno; + payload.regBuffers = (cast(iovec*)p)[0..buffers.length]; + + foreach (i, b; buffers) + { + assert(b.length, "Empty buffer"); + payload.regBuffers[i].iov_base = cast(void*)&b[0]; + payload.regBuffers[i].iov_len = b.length; + } + } + + immutable r = io_uring_register( + payload.fd, + RegisterOpCode.REGISTER_BUFFERS, + cast(const(void)*)payload.regBuffers.ptr, 1 + ); + + if (_expect(r < 0, false)) return -errno; + return 0; + } + + /** + * Releases all previously registered buffers associated with the `io_uring` instance. + * + * An application need not unregister buffers explicitly before shutting down the io_uring instance. + * + * Returns: On success, returns 0. On error, `-errno` is returned. + */ + int unregisterBuffers() @trusted + in (payload !is null, "Uring hasn't been initialized yet") + { + if (payload.regBuffers is null) + return -ENXIO; // no buffers were registered + + free(cast(void*)&payload.regBuffers[0]); + payload.regBuffers = null; + + immutable r = io_uring_register(payload.fd, RegisterOpCode.UNREGISTER_BUFFERS, null, 0); + if (_expect(r < 0, false)) return -errno; + return 0; + } + + /** + * Register files for I/O. + * + * To make use of the registered files, the `IOSQE_FIXED_FILE` flag must be set in the flags + * member of the `SubmissionEntry`, and the `fd` member is set to the index of the file in the + * file descriptor array. + * + * Files are automatically unregistered when the `io_uring` instance is torn down. An application + * need only unregister if it wishes to register a new set of fds. + * + * Use `-1` as a file descriptor to mark it as reserved in the array.* + * Params: fds = array of file descriptors to be registered + * + * Returns: On success, returns 0. On error, `-errno` is returned. + */ + int registerFiles(const(int)[] fds) + in (payload !is null, "Uring hasn't been initialized yet") + in (fds.length, "No file descriptors provided") + in (fds.length < uint.max, "Too many file descriptors") + { + // arg contains a pointer to an array of nr_args file descriptors (signed 32 bit integers). + immutable r = io_uring_register(payload.fd, RegisterOpCode.REGISTER_FILES, &fds[0], cast(uint)fds.length); + if (_expect(r < 0, false)) return -errno; + return 0; + } + + /* + * Register an update for an existing file set. The updates will start at + * `off` in the original array. + * + * Use `-1` as a file descriptor to mark it as reserved in the array. + * + * Params: + * off = offset to the original registered files to be updated + * files = array of file descriptors to update with + * + * Returns: number of files updated on success, -errno on failure. 
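Editor's sketch (not part of the patch): registering a single buffer and reading into it with `READ_FIXED`, combining `registerBuffers` above with the `prepReadFixed` helper defined further down in this file. `fd` is assumed to be an already opened file descriptor.

void exampleFixedRead(ref Uring io, int fd)
{
    auto buf = new ubyte[](4096);
    assert(io.registerBuffers(buf) == 0);       // map the buffer into the kernel once

    io.next().prepReadFixed(fd, 0, buf, 0);     // buf_index 0 refers to the buffer registered above
    io.submit(1);

    assert(io.front.res >= 0);                  // res is the number of bytes read
    io.popFront();

    io.unregisterBuffers();
}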
+ */ + int registerFilesUpdate(uint off, const(int)[] fds) @trusted + in (payload !is null, "Uring hasn't been initialized yet") + in (fds.length, "No file descriptors provided to update") + in (fds.length < uint.max, "Too many file descriptors") + { + struct Update // represents io_uring_files_update (obsolete) or io_uring_rsrc_update + { + uint offset; + uint _resv; + ulong data; + } + + static assert (Update.sizeof == 16); + + Update u = { offset: off, data: cast(ulong)&fds[0] }; + immutable r = io_uring_register( + payload.fd, + RegisterOpCode.REGISTER_FILES_UPDATE, + &u, cast(uint)fds.length); + if (_expect(r < 0, false)) return -errno; + return 0; + } + + /** + * All previously registered files associated with the `io_uring` instance will be unregistered. + * + * Files are automatically unregistered when the `io_uring` instance is torn down. An application + * need only unregister if it wishes to register a new set of fds. + * + * Returns: On success, returns 0. On error, `-errno` is returned. + */ + int unregisterFiles() @trusted + in (payload !is null, "Uring hasn't been initialized yet") + { + immutable r = io_uring_register(payload.fd, RegisterOpCode.UNREGISTER_FILES, null, 0); + if (_expect(r < 0, false)) return -errno; + return 0; + } + + /** + * Registers event file descriptor that would be used as a notification mechanism on completion + * queue change. + * + * Params: eventFD = event filedescriptor to be notified about change + * + * Returns: On success, returns 0. On error, `-errno` is returned. + */ + int registerEventFD(int eventFD) @trusted + in (payload !is null, "Uring hasn't been initialized yet") + { + immutable r = io_uring_register(payload.fd, RegisterOpCode.REGISTER_EVENTFD, &eventFD, 1); + if (_expect(r < 0, false)) return -errno; + return 0; + } + + /** + * Unregister previously registered notification event file descriptor. + * + * Returns: On success, returns 0. On error, `-errno` is returned. + */ + int unregisterEventFD() @trusted + in (payload !is null, "Uring hasn't been initialized yet") + { + immutable r = io_uring_register(payload.fd, RegisterOpCode.UNREGISTER_EVENTFD, null, 0); + if (_expect(r < 0, false)) return -errno; + return 0; + } + + /** + * Generic means to register resources. + * + * Note: Available from Linux 5.13 + */ + int registerRsrc(R)(RegisterOpCode type, const(R)[] data, const(ulong)[] tags) + in (payload !is null, "Uring hasn't been initialized yet") + in (data.length == tags.length, "Different array lengths") + in (data.length < uint.max, "Too many resources") + { + struct Register // represents io_uring_rsrc_register + { + uint nr; + uint flags; + ulong _resv2; + ulong data; + ulong tags; + } + + static assert (Register.sizeof == 32); + + Register r = { + nr: cast(uint)data.length, + data: cast(ulong)&data[0], + tags: cast(ulong)&tags[0] + }; + immutable ret = io_uring_register( + payload.fd, + type, + &r, sizeof(r)); + if (_expect(ret < 0, false)) return -errno; + return 0; + } + + /** + * Generic means to update registered resources. 
+ * + * Note: Available from Linux 5.13 + */ + int registerRsrcUpdate(R)(RegisterOpCode type, uint off, const(R)[] data, const(ulong)[] tags) @trusted + in (payload !is null, "Uring hasn't been initialized yet") + in (data.length == tags.length, "Different array lengths") + in (data.length < uint.max, "Too many file descriptors") + { + struct Update // represents io_uring_rsrc_update2 + { + uint offset; + uint _resv; + ulong data; + ulong tags; + uint nr; + uint _resv2; + } + + static assert (Update.sizeof == 32); + + Update u = { + offset: off, + data: cast(ulong)&data[0], + tags: cast(ulong)&tags[0], + nr: cast(uint)data.length, + }; + immutable r = io_uring_register(payload.fd, type, &u, sizeof(u)); + if (_expect(r < 0, false)) return -errno; + return 0; + } + + /** + * Register a feature whitelist. Attempting to call any operations which are not whitelisted + * will result in an error. + * + * Note: Can only be called once to prevent other code from bypassing the whitelist. + * + * Params: res = the struct containing the restriction info. + * + * Returns: On success, returns 0. On error, `-errno` is returned. + * + * Note: Available from Linux 5.10 + */ + int registerRestrictions(scope ref io_uring_restriction res) @trusted + in (payload !is null, "Uring hasn't been initialized yet") + { + immutable r = io_uring_register(payload.fd, RegisterOpCode.REGISTER_RESTRICTIONS, &res, 1); + if (_expect(r < 0, false)) return -errno; + return 0; + } + + /** + * Enable the "rings" of this Uring if they were previously disabled with + * `IORING_SETUP_R_DISABLED`. + * + * Returns: On success, returns 0. On error, `-errno` is returned. + * + * Note: Available from Linux 5.10 + */ + int enableRings() @trusted + in (payload !is null, "Uring hasn't been initialized yet") + { + immutable r = io_uring_register(payload.fd, RegisterOpCode.ENABLE_RINGS, null, 0); + if (_expect(r < 0, false)) return -errno; + return 0; + } + + /** + * By default, async workers created by io_uring will inherit the CPU mask of its parent. This + * is usually all the CPUs in the system, unless the parent is being run with a limited set. If + * this isn't the desired outcome, the application may explicitly tell io_uring what CPUs the + * async workers may run on. + * + * Note: Available since 5.14. + */ + int registerIOWQAffinity(cpu_set_t[] cpus) @trusted + in (cpus.length) + { + immutable r = io_uring_register(payload.fd, RegisterOpCode.REGISTER_IOWQ_AFF, cpus.ptr, cast(uint)cpus.length); + if (_expect(r < 0, false)) return -errno; + return 0; + } + + /** + * Undoes a CPU mask previously set with `registerIOWQAffinity`. + * + * Note: Available since 5.14 + */ + int unregisterIOWQAffinity() @trusted + { + immutable r = io_uring_register(payload.fd, RegisterOpCode.UNREGISTER_IOWQ_AFF, null, 0); + if (_expect(r < 0, false)) return -errno; + return 0; + } + + /** + * By default, io_uring limits the unbounded workers created to the maximum processor count set + * by `RLIMIT_NPROC` and the bounded workers is a function of the SQ ring size and the number of + * CPUs in the system. Sometimes this can be excessive (or too little, for bounded), and this + * command provides a way to change the count per ring (per NUMA node) instead. + * + * `val` must be set to an `uint` pointer to an array of two values, with the values in the + * array being set to the maximum count of workers per NUMA node. Index 0 holds the bounded + * worker count, and index 1 holds the unbounded worker count. 
On successful return, the passed + * in array will contain the previous maximum values for each type. If the count being passed in + * is 0, then this command returns the current maximum values and doesn't modify the current + * setting. + * + * Note: Available since 5.15 + */ + int registerIOWQMaxWorkers(ref uint[2] workers) @trusted + { + immutable r = io_uring_register(payload.fd, RegisterOpCode.REGISTER_IOWQ_MAX_WORKERS, &workers, 2); + if (_expect(r < 0, false)) return -errno; + return 0; + } +} + +/** + * Uses custom operation definition to fill fields of `SubmissionEntry`. + * Can be used in cases, when builtin prep* functions aren't enough. + * + * Custom definition fields must correspond to fields of `SubmissionEntry` for this to work. + * + * Note: This doesn't touch previous state of the entry, just fills the corresponding fields. + * So it might be needed to call `clear` first on the entry (depends on usage). + * + * Params: + * entry = entry to set parameters to + * op = operation to fill entry with (can be custom type) + */ +ref SubmissionEntry fill(E)(return ref SubmissionEntry entry, auto ref E op) +{ + pragma(inline); + import std.traits : hasMember, FieldNameTuple; + + // fill entry from provided operation fields (they must have same name as in SubmissionEntry) + foreach (m; FieldNameTuple!E) + { + static assert(hasMember!(SubmissionEntry, m), "unknown member: " ~ E.stringof ~ "." ~ m); + __traits(getMember, entry, m) = __traits(getMember, op, m); + } + + return entry; +} + +/** + * Template function to help set `SubmissionEntry` `user_data` field. + * + * Params: + * entry = `SubmissionEntry` to prepare + * data = data to set to the `SubmissionEntry` + * + * Note: data are passed by ref and must live during whole operation. + */ +ref SubmissionEntry setUserData(D)(return ref SubmissionEntry entry, ref D data) @trusted +{ + pragma(inline); + entry.user_data = cast(ulong)(cast(void*)&data); + return entry; +} + +/** + * Template function to help set `SubmissionEntry` `user_data` field. This differs to `setUserData` + * in that it emplaces the provided data directly into SQE `user_data` field and not the pointer to + * the data. + * + * Because of that, data must be of `ulong.sizeof`. + */ +ref SubmissionEntry setUserDataRaw(D)(return ref SubmissionEntry entry, auto ref D data) @trusted + if (D.sizeof == ulong.sizeof) +{ + pragma(inline); + entry.user_data = *(cast(ulong*)(cast(void*)&data)); + return entry; +} + +/** + * Helper function to retrieve data set directly to the `CompletionEntry` user_data (set by `setUserDataRaw`). + */ +D userDataAs(D)(ref CompletionEntry entry) @trusted + if (D.sizeof == ulong.sizeof) +{ + pragma(inline); + return *(cast(D*)(cast(void*)&entry.user_data)); +} + +ref SubmissionEntry prepRW(return ref SubmissionEntry entry, Operation op, + int fd = -1, const void* addr = null, uint len = 0, ulong offset = 0) @safe +{ + pragma(inline); + entry.opcode = op; + entry.fd = fd; + entry.off = offset; + entry.flags = SubmissionEntryFlags.NONE; + entry.ioprio = 0; + entry.addr = cast(ulong)addr; + entry.len = len; + entry.rw_flags = ReadWriteFlags.NONE; + entry.user_data = 0; + entry.buf_index = 0; + entry.personality = 0; + entry.file_index = 0; + entry.addr3 = 0; + entry.__pad2[0] = 0; + return entry; +} + +/** + * Prepares `nop` operation. 
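Editor's sketch (not part of the patch): carrying per-request state through `user_data`, using `setUserDataRaw` on the SQE and `userDataAs` on the CQE as documented above. The `RequestId` struct is purely illustrative.

struct RequestId { uint kind; uint seq; }
static assert(RequestId.sizeof == ulong.sizeof);    // setUserDataRaw requires exactly 64 bits

void exampleUserData(ref Uring io)
{
    io.putWith!((ref SubmissionEntry e) {
        e.prepNop();
        e.setUserDataRaw(RequestId(7, 42));         // emplaced directly into the user_data field
    });
    io.submit(1);

    auto cqe = io.front;
    auto id = cqe.userDataAs!RequestId;             // recovered unchanged from the completion
    assert(id.kind == 7 && id.seq == 42);
    io.popFront();
}

`setUserData`, by contrast, stores a pointer, so the referenced data must stay alive for the whole operation.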
+ * + * Params: + * entry = `SubmissionEntry` to prepare + */ +ref SubmissionEntry prepNop(return ref SubmissionEntry entry) @safe +{ + entry.prepRW(Operation.NOP); + return entry; +} + +/** + * Prepares `readv` operation. + * + * Params: + * entry = `SubmissionEntry` to prepare + * fd = file descriptor of file we are operating on + * offset = offset + * buffer = iovec buffers to be used by the operation + */ +ref SubmissionEntry prepReadv(V)(return ref SubmissionEntry entry, int fd, ref const V buffer, long offset) @trusted + if (is(V == iovec[]) || is(V == iovec)) +{ + static if (is(V == iovec[])) + { + assert(buffer.length, "Empty buffer"); + assert(buffer.length < uint.max, "Too many iovec buffers"); + return entry.prepRW(Operation.READV, fd, cast(void*)&buffer[0], cast(uint)buffer.length, offset); + } + else return entry.prepRW(Operation.READV, fd, cast(void*)&buffer, 1, offset); +} + +/** + * Prepares `writev` operation. + * + * Params: + * entry = `SubmissionEntry` to prepare + * fd = file descriptor of file we are operating on + * offset = offset + * buffer = iovec buffers to be used by the operation + */ +ref SubmissionEntry prepWritev(V)(return ref SubmissionEntry entry, int fd, ref const V buffer, long offset) @trusted + if ((is(Unqual!V == U[], U) && is(Unqual!U == iovec)) || is(Unqual!V == iovec)) +{ + static if (is(typeof(buffer.length))) + { + assert(buffer.length, "Empty buffer"); + assert(buffer.length < uint.max, "Too many iovec buffers"); + return entry.prepRW(Operation.WRITEV, fd, cast(void*)&buffer[0], cast(uint)buffer.length, offset); + } + else return entry.prepRW(Operation.WRITEV, fd, cast(void*)&buffer, 1, offset); +} + +/** + * Prepares `read_fixed` operation. + * + * Params: + * entry = `SubmissionEntry` to prepare + * fd = file descriptor of file we are operating on + * offset = offset + * buffer = slice to preregistered buffer + * bufferIndex = index to the preregistered buffers array buffer belongs to + */ +ref SubmissionEntry prepReadFixed(return ref SubmissionEntry entry, int fd, long offset, ubyte[] buffer, ushort bufferIndex) @safe +{ + assert(buffer.length, "Empty buffer"); + assert(buffer.length < uint.max, "Buffer too large"); + entry.prepRW(Operation.READ_FIXED, fd, cast(void*)&buffer[0], cast(uint)buffer.length, offset); + entry.buf_index = bufferIndex; + return entry; +} + +/** + * Prepares `write_fixed` operation. + * + * Params: + * entry = `SubmissionEntry` to prepare + * fd = file descriptor of file we are operating on + * offset = offset + * buffer = slice to preregistered buffer + * bufferIndex = index to the preregistered buffers array buffer belongs to + */ +ref SubmissionEntry prepWriteFixed(return ref SubmissionEntry entry, int fd, long offset, ubyte[] buffer, ushort bufferIndex) @safe +{ + assert(buffer.length, "Empty buffer"); + assert(buffer.length < uint.max, "Buffer too large"); + entry.prepRW(Operation.WRITE_FIXED, fd, cast(void*)&buffer[0], cast(uint)buffer.length, offset); + entry.buf_index = bufferIndex; + return entry; +} + +/** + * Prepares `recvmsg(2)` operation. + * + * Params: + * entry = `SubmissionEntry` to prepare + * fd = file descriptor of file we are operating on + * msg = message to operate with + * flags = `recvmsg` operation flags + * + * Note: Available from Linux 5.3 + * + * See_Also: `recvmsg(2)` man page for details. 
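Editor's sketch (not part of the patch): a scatter read with `prepReadv` into two buffers through an `iovec` array, as prepared by the helper above. `fd` is assumed to be an already opened file descriptor.

void exampleReadv(ref Uring io, int fd)
{
    import core.sys.posix.sys.uio : iovec;

    ubyte[64] a, b;
    iovec[2] vecs = [iovec(&a[0], a.length), iovec(&b[0], b.length)];
    iovec[] slice = vecs[];                     // lvalue slice; prepReadv takes it by ref

    io.next().prepReadv(fd, slice, 0);          // scatter read starting at file offset 0
    io.submit(1);

    assert(io.front.res >= 0);                  // total bytes read across both buffers
    io.popFront();
}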
+ */ +ref SubmissionEntry prepRecvMsg(return ref SubmissionEntry entry, int fd, ref msghdr msg, MsgFlags flags = MsgFlags.NONE) @trusted +{ + entry.prepRW(Operation.RECVMSG, fd, cast(void*)&msg, 1, 0); + entry.msg_flags = flags; + return entry; +} + +/** + * Prepares `sendmsg(2)` operation. + * + * Params: + * entry = `SubmissionEntry` to prepare + * fd = file descriptor of file we are operating on + * msg = message to operate with + * flags = `sendmsg` operation flags + * + * Note: Available from Linux 5.3 + * + * See_Also: `sendmsg(2)` man page for details. + */ +ref SubmissionEntry prepSendMsg(return ref SubmissionEntry entry, int fd, ref msghdr msg, MsgFlags flags = MsgFlags.NONE) @trusted +{ + entry.prepRW(Operation.SENDMSG, fd, cast(void*)&msg, 1, 0); + entry.msg_flags = flags; + return entry; +} + +/** + * Prepares `fsync` operation. + * + * Params: + * entry = `SubmissionEntry` to prepare + * fd = file descriptor of a file to call `fsync` on + * flags = `fsync` operation flags + */ +ref SubmissionEntry prepFsync(return ref SubmissionEntry entry, int fd, FsyncFlags flags = FsyncFlags.NORMAL) @safe +{ + entry.prepRW(Operation.FSYNC, fd); + entry.fsync_flags = flags; + return entry; +} + +/** + * Poll the fd specified in the submission queue entry for the events specified in the poll_events + * field. Unlike poll or epoll without `EPOLLONESHOT`, this interface always works in one shot mode. + * That is, once the poll operation is completed, it will have to be resubmitted. + * + * Params: + * entry = `SubmissionEntry` to prepare + * fd = file descriptor to poll + * events = events to poll on the FD + * flags = poll operation flags + */ +ref SubmissionEntry prepPollAdd(return ref SubmissionEntry entry, + int fd, PollEvents events, PollFlags flags = PollFlags.NONE) @safe +{ + import std.system : endian, Endian; + + entry.prepRW(Operation.POLL_ADD, fd, null, flags); + static if (endian == Endian.bigEndian) + entry.poll_events32 = (events & 0x0000ffffUL) << 16 | (events & 0xffff0000) >> 16; + else + entry.poll_events32 = events; + return entry; +} + +/** + * Remove an existing poll request. If found, the res field of the `CompletionEntry` will contain + * `0`. If not found, res will contain `-ENOENT`. + * + * Params: + * entry = `SubmissionEntry` to prepare + * userData = data with the previously issued poll operation + */ +ref SubmissionEntry prepPollRemove(D)(return ref SubmissionEntry entry, ref D userData) @trusted +{ + return entry.prepRW(Operation.POLL_REMOVE, -1, cast(void*)&userData); +} + +/** + * Allow events and user_data update of running poll requests. + * + * Note: available from Linux 5.13 + */ +ref SubmissionEntry prepPollUpdate(U, V)(return ref SubmissionEntry entry, + ref U oldUserData, ref V newUserData, PollEvents events = PollEvents.NONE) @trusted +{ + import std.system : endian, Endian; + + PollFlags flags; + if (events != PollEvents.NONE) flags |= PollFlags.UPDATE_EVENTS; + if (cast(void*)&oldUserData !is cast(void*)&newUserData) flags |= PollFlags.UPDATE_USER_DATA; + + entry.prepRW( + Operation.POLL_REMOVE, + -1, + cast(void*)&oldUserData, + flags, + cast(ulong)cast(void*)&newUserData + ); + static if (endian == Endian.bigEndian) + entry.poll_events32 = (events & 0x0000ffffUL) << 16 | (events & 0xffff0000) >> 16; + else + entry.poll_events32 = events; + return entry; +} + +/** + * Prepares `sync_file_range(2)` operation. + * + * Sync a file segment with disk, permits fine control when synchronizing the open file referred to + * by the file descriptor fd with disk. 
+ * + * If `len` is 0, then all bytes from `offset` through to the end of file are synchronized. + * + * Params: + * entry = `SubmissionEntry` to prepare + * fd = is the file descriptor to sync + * offset = the starting byte of the file range to be synchronized + * len = the length of the range to be synchronized, in bytes + * flags = the flags for the command. + * + * See_Also: `sync_file_range(2)` for the general description of the related system call. + * + * Note: available from Linux 5.2 + */ +ref SubmissionEntry prepSyncFileRange(return ref SubmissionEntry entry, int fd, ulong offset, uint len, + SyncFileRangeFlags flags = SyncFileRangeFlags.WRITE_AND_WAIT) @safe +{ + entry.prepRW(Operation.SYNC_FILE_RANGE, fd, null, len, offset); + entry.sync_range_flags = flags; + return entry; +} + +/** + * This command will register a timeout operation. + * + * A timeout will trigger a wakeup event on the completion ring for anyone waiting for events. A + * timeout condition is met when either the specified timeout expires, or the specified number of + * events have completed. Either condition will trigger the event. The request will complete with + * `-ETIME` if the timeout got completed through expiration of the timer, or `0` if the timeout got + * completed through requests completing on their own. If the timeout was cancelled before it + * expired, the request will complete with `-ECANCELED`. + * + * Applications may delete existing timeouts before they occur with `TIMEOUT_REMOVE` operation. + * + * Params: + * entry = `SubmissionEntry` to prepare + * time = reference to `time64` data structure + * count = completion event count + * flags = define if it's a relative or absolute time + * + * Note: Available from Linux 5.4 + */ +ref SubmissionEntry prepTimeout(return ref SubmissionEntry entry, ref KernelTimespec time, + ulong count = 0, TimeoutFlags flags = TimeoutFlags.REL) @trusted +{ + entry.prepRW(Operation.TIMEOUT, -1, cast(void*)&time, 1, count); + entry.timeout_flags = flags; + return entry; +} + +/** + * Prepares operations to remove existing timeout registered using `TIMEOUT`operation. + * + * Attempt to remove an existing timeout operation. If the specified timeout request is found and + * cancelled successfully, this request will terminate with a result value of `-ECANCELED`. If the + * timeout request was found but expiration was already in progress, this request will terminate + * with a result value of `-EALREADY`. If the timeout request wasn't found, the request will + * terminate with a result value of `-ENOENT`. + * + * Params: + * entry = `SubmissionEntry` to prepare + * userData = user data provided with the previously issued timeout operation + * + * Note: Available from Linux 5.5 + */ +ref SubmissionEntry prepTimeoutRemove(D)(return ref SubmissionEntry entry, ref D userData) @trusted +{ + return entry.prepRW(Operation.TIMEOUT_REMOVE, -1, cast(void*)&userData); +} + +/** + * Prepares operations to update existing timeout registered using `TIMEOUT`operation. 
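Editor's sketch (not part of the patch): a 100 ms relative timeout using `prepTimeout` as described above. The `KernelTimespec` field names are assumed to mirror `__kernel_timespec`, and the timespec must stay alive until the request completes.

void exampleTimeout(ref Uring io)
{
    import core.stdc.errno : ETIME;

    KernelTimespec ts;
    ts.tv_sec = 0;
    ts.tv_nsec = 100_000_000;                   // 100 ms

    io.next().prepTimeout(ts, 0, TimeoutFlags.REL);
    io.submit(1);

    assert(io.front.res == -ETIME);             // expired rather than satisfied by other completions
    io.popFront();
}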
+ * + * Params: + * entry = `SubmissionEntry` to prepare + * userData = user data provided with the previously issued timeout operation + * time = reference to `time64` data structure with a new time spec + * flags = define if it's a relative or absolute time + * + * Note: Available from Linux 5.11 + */ +ref SubmissionEntry prepTimeoutUpdate(D)(return ref SubmissionEntry entry, + ref KernelTimespec time, ref D userData, TimeoutFlags flags) @trusted +{ + entry.prepRW(Operation.TIMEOUT_REMOVE, -1, cast(void*)&userData, 0, cast(ulong)(cast(void*)&time)); + entry.timeout_flags = flags | TimeoutFlags.UPDATE; + return entry; +} + +/** + * Prepares `accept4(2)` operation. + * + * See_Also: `accept4(2)`` for the general description of the related system call. + * + * Params: + * entry = `SubmissionEntry` to prepare + * fd = socket file descriptor + * addr = reference to one of sockaddr structires to be filled with accepted client address + * addrlen = reference to addrlen field that would be filled with accepted client address length + * + * Note: Available from Linux 5.5 + */ +ref SubmissionEntry prepAccept(ADDR)(return ref SubmissionEntry entry, int fd, ref ADDR addr, ref socklen_t addrlen, + AcceptFlags flags = AcceptFlags.NONE) @trusted +{ + entry.prepRW(Operation.ACCEPT, fd, cast(void*)&addr, 0, cast(ulong)(cast(void*)&addrlen)); + entry.accept_flags = flags; + return entry; +} + +/** + * Same as `prepAccept`, but fd is put directly into fixed file table on `fileIndex`. + * Note: available from Linux 5.15 + */ +ref SubmissionEntry prepAcceptDirect(ADDR)(return ref SubmissionEntry entry, int fd, ref ADDR addr, ref socklen_t addrlen, + uint fileIndex, AcceptFlags flags = AcceptFlags.NONE) @trusted +{ + entry.prepRW(Operation.ACCEPT, fd, cast(void*)&addr, 0, cast(ulong)(cast(void*)&addrlen)); + entry.accept_flags = flags; + entry.file_index = fileIndex+1; + return entry; +} + +/** + * Prepares operation that cancels existing async work. + * + * This works with any read/write request, accept,send/recvmsg, etc. There’s an important + * distinction to make here with the different kinds of commands. A read/write on a regular file + * will generally be waiting for IO completion in an uninterruptible state. This means it’ll ignore + * any signals or attempts to cancel it, as these operations are uncancellable. io_uring can cancel + * these operations if they haven’t yet been started. If they have been started, cancellations on + * these will fail. Network IO will generally be waiting interruptibly, and can hence be cancelled + * at any time. The completion event for this request will have a result of 0 if done successfully, + * `-EALREADY` if the operation is already in progress, and `-ENOENT` if the original request + * specified cannot be found. For cancellation requests that return `-EALREADY`, io_uring may or may + * not cause this request to be stopped sooner. For blocking IO, the original request will complete + * as it originally would have. For IO that is cancellable, it will terminate sooner if at all + * possible. + * + * Params: + * entry = `SubmissionEntry` to prepare + * userData = `user_data` field of the request that should be cancelled + * + * Note: Available from Linux 5.5 + */ +ref SubmissionEntry prepCancel(D)(return ref SubmissionEntry entry, ref D userData, uint flags = 0) @trusted +{ + entry.prepRW(Operation.ASYNC_CANCEL, -1, cast(void*)&userData); + entry.cancel_flags = cast(CancelFlags)flags; + return entry; +} + +/** + * Prepares linked timeout operation. 
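Editor's sketch (not part of the patch): accepting a client on an already listening TCP socket with `prepAccept`. On success the CQE `res` holds the new file descriptor; `addr` and `addrlen` must stay alive until the completion arrives.

void exampleAccept(ref Uring io, int listenFd)
{
    import core.sys.posix.netinet.in_ : sockaddr_in;
    import core.sys.posix.sys.socket : socklen_t;

    sockaddr_in addr;
    socklen_t addrlen = cast(socklen_t) addr.sizeof;

    io.next().prepAccept(listenFd, addr, addrlen);
    io.submit(1);

    int clientFd = io.front.res;                // negative values are -errno
    io.popFront();
}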
+ * + * This request must be linked with another request through `IOSQE_IO_LINK` which is described below. + * Unlike `IORING_OP_TIMEOUT`, `IORING_OP_LINK_TIMEOUT` acts on the linked request, not the completion + * queue. The format of the command is otherwise like `IORING_OP_TIMEOUT`, except there's no + * completion event count as it's tied to a specific request. If used, the timeout specified in the + * command will cancel the linked command, unless the linked command completes before the + * timeout. The timeout will complete with `-ETIME` if the timer expired and the linked request was + * attempted cancelled, or `-ECANCELED` if the timer got cancelled because of completion of the linked + * request. + * + * Note: Available from Linux 5.5 + * + * Params: + * entry = `SubmissionEntry` to prepare + * time = time specification + * flags = define if it's a relative or absolute time + */ +ref SubmissionEntry prepLinkTimeout(return ref SubmissionEntry entry, ref KernelTimespec time, TimeoutFlags flags = TimeoutFlags.REL) @trusted +{ + entry.prepRW(Operation.LINK_TIMEOUT, -1, cast(void*)&time, 1, 0); + entry.timeout_flags = flags; + return entry; +} + +/** + * Note: Available from Linux 5.5 + */ +ref SubmissionEntry prepConnect(ADDR)(return ref SubmissionEntry entry, int fd, ref const(ADDR) addr) @trusted +{ + return entry.prepRW(Operation.CONNECT, fd, cast(void*)&addr, 0, ADDR.sizeof); +} + +/** + * Note: Available from Linux 5.6 + */ +ref SubmissionEntry prepFilesUpdate(return ref SubmissionEntry entry, int[] fds, int offset) @safe +{ + return entry.prepRW(Operation.FILES_UPDATE, -1, cast(void*)&fds[0], cast(uint)fds.length, offset); +} + +/** + * Note: Available from Linux 5.6 + */ +ref SubmissionEntry prepFallocate(return ref SubmissionEntry entry, int fd, int mode, long offset, long len) @trusted +{ + return entry.prepRW(Operation.FALLOCATE, fd, cast(void*)len, mode, offset); +} + +/** + * Note: Available from Linux 5.6 + */ +ref SubmissionEntry prepOpenat(return ref SubmissionEntry entry, int fd, const(char)* path, int flags, uint mode) +{ + entry.prepRW(Operation.OPENAT, fd, cast(void*)path, mode, 0); + entry.open_flags = flags; + return entry; +} + +/** + * Same as `prepOpenat`, but fd is put directly into fixed file table on `fileIndex`. + * Note: available from Linux 5.15 + */ +ref SubmissionEntry prepOpenatDirect(return ref SubmissionEntry entry, int fd, const(char)* path, int flags, uint mode, uint fileIndex) +{ + entry.prepRW(Operation.OPENAT, fd, cast(void*)path, mode, 0); + entry.open_flags = flags; + entry.file_index = fileIndex+1; + return entry; +} + +/** + * Note: Available from Linux 5.6 + */ +ref SubmissionEntry prepClose(return ref SubmissionEntry entry, int fd) @safe +{ + return entry.prepRW(Operation.CLOSE, fd); +} + +/** + * Same as `prepClose` but operation works directly with fd registered in fixed file table on index `fileIndex`. 
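Editor's sketch (not part of the patch): bounding a read with a linked timeout, mirroring the `IORING_OP_LINK_TIMEOUT` description above. `SubmissionEntryFlags.IO_LINK` is assumed to be this module's name for `IOSQE_IO_LINK`.

void exampleLinkedTimeout(ref Uring io, int fd, ubyte[] buf)
{
    KernelTimespec ts;
    ts.tv_sec = 1;                              // cancel the read if it takes longer than a second

    io.next().prepRead(fd, buf, 0)
        .flags |= SubmissionEntryFlags.IO_LINK; // assumed flag name; links the next SQE to this one
    io.next().prepLinkTimeout(ts);

    io.submit(2);                               // the read completes normally or with -ECANCELED
}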
+ * Note: Available from Linux 5.15 + */ +ref SubmissionEntry prepCloseDirect(return ref SubmissionEntry entry, int fd, uint fileIndex) @safe +{ + entry.prepRW(Operation.CLOSE, fd); + entry.file_index = fileIndex+1; + return entry; +} + +/** + * Note: Available from Linux 5.6 + */ +ref SubmissionEntry prepRead(return ref SubmissionEntry entry, int fd, ubyte[] buffer, long offset) @safe +{ + return entry.prepRW(Operation.READ, fd, cast(void*)&buffer[0], cast(uint)buffer.length, offset); +} + +/** + * Note: Available from Linux 5.6 + */ +ref SubmissionEntry prepWrite(return ref SubmissionEntry entry, int fd, const(ubyte)[] buffer, long offset) @trusted +{ + return entry.prepRW(Operation.WRITE, fd, cast(void*)&buffer[0], cast(uint)buffer.length, offset); +} + +/** + * Note: Available from Linux 5.6 + */ +ref SubmissionEntry prepStatx(Statx)(return ref SubmissionEntry entry, int fd, const(char)* path, + int flags, uint mask, ref Statx statxbuf) +{ + entry.prepRW(Operation.STATX, fd, cast(void*)path, mask, cast(ulong)(cast(void*)&statxbuf)); + entry.statx_flags = flags; + return entry; +} + +/** + * Note: Available from Linux 5.6 + */ +ref SubmissionEntry prepFadvise(return ref SubmissionEntry entry, int fd, long offset, uint len, int advice) @safe +{ + entry.prepRW(Operation.FADVISE, fd, null, len, offset); + entry.fadvise_advice = advice; + return entry; +} + +/** + * Note: Available from Linux 5.6 + */ +ref SubmissionEntry prepMadvise(return ref SubmissionEntry entry, const(ubyte)[] block, int advice) @trusted +{ + entry.prepRW(Operation.MADVISE, -1, cast(void*)&block[0], cast(uint)block.length, 0); + entry.fadvise_advice = advice; + return entry; +} + +/** + * Note: Available from Linux 5.6 + */ +ref SubmissionEntry prepSend(return ref SubmissionEntry entry, + int sockfd, const(ubyte)[] buf, MsgFlags flags = MsgFlags.NONE) @trusted +{ + entry.prepRW(Operation.SEND, sockfd, cast(void*)&buf[0], cast(uint)buf.length, 0); + entry.msg_flags = flags; + return entry; +} + +/** + * Note: Available from Linux 5.6 + */ +ref SubmissionEntry prepRecv(return ref SubmissionEntry entry, + int sockfd, ubyte[] buf, MsgFlags flags = MsgFlags.NONE) @trusted +{ + entry.prepRW(Operation.RECV, sockfd, cast(void*)&buf[0], cast(uint)buf.length, 0); + entry.msg_flags = flags; + return entry; +} + +/** + * Variant that uses registered buffers group. + * + * Note: Available from Linux 5.6 + */ +ref SubmissionEntry prepRecv(return ref SubmissionEntry entry, + int sockfd, ushort gid, uint len, MsgFlags flags = MsgFlags.NONE) @safe +{ + entry.prepRW(Operation.RECV, sockfd, null, len, 0); + entry.msg_flags = flags; + entry.buf_group = gid; + entry.flags |= SubmissionEntryFlags.BUFFER_SELECT; + return entry; +} + +/** + * Note: Available from Linux 5.6 + */ +ref SubmissionEntry prepOpenat2(return ref SubmissionEntry entry, int fd, const char *path, ref OpenHow how) @trusted +{ + return entry.prepRW(Operation.OPENAT2, fd, cast(void*)path, cast(uint)OpenHow.sizeof, cast(ulong)(cast(void*)&how)); +} + +/** + * Same as `prepOpenat2`, but fd is put directly into fixed file table on `fileIndex`. 
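Editor's sketch (not part of the patch): moving four bytes over a connected socket pair with `prepSend` and `prepRecv`. `a` and `b` are assumed to be the two ends of an already connected pair.

void exampleSendRecv(ref Uring io, int a, int b)
{
    static immutable ubyte[4] ping = [1, 2, 3, 4];
    ubyte[4] reply;

    io.next().prepSend(a, ping[]);
    io.next().prepRecv(b, reply[]);
    io.submit(2);                               // each CQE res holds the number of bytes moved

    while (!io.empty) io.popFront();
    // reply now holds the transmitted bytes (assuming all four arrived in one recv)
}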
+ * Note: available from Linux 5.15 + */ + ref SubmissionEntry prepOpenat2Direct(return ref SubmissionEntry entry, int fd, const char *path, ref OpenHow how, uint fileIndex) @trusted +{ + entry.prepRW(Operation.OPENAT2, fd, cast(void*)path, cast(uint)OpenHow.sizeof, cast(ulong)(cast(void*)&how)); + entry.file_index = fileIndex+1; + return entry; +} + +/** + * Note: Available from Linux 5.6 + */ +ref SubmissionEntry prepEpollCtl(return ref SubmissionEntry entry, int epfd, int fd, int op, ref epoll_event ev) @trusted +{ + return entry.prepRW(Operation.EPOLL_CTL, epfd, cast(void*)&ev, op, fd); +} + +/** + * Note: Available from Linux 5.7 + * + * This splice operation can be used to implement sendfile by splicing to an intermediate pipe + * first, then splice to the final destination. In fact, the implementation of sendfile in kernel + * uses splice internally. + * + * NOTE that even if fd_in or fd_out refers to a pipe, the splice operation can still fail with + * EINVAL if one of the fd doesn't explicitly support splice operation, e.g. reading from terminal + * is unsupported from kernel 5.7 to 5.11. Check issue #291 for more information. + * + * Either fd_in or fd_out must be a pipe. + * + * Params + * fd_in = input file descriptor + * off_in = If fd_in refers to a pipe, off_in must be -1. + * If fd_in does not refer to a pipe and off_in is -1, then bytes are read from + * fd_in starting from the file offset and it is adjust appropriately; + * If fd_in does not refer to a pipe and off_in is not -1, then the starting + * offset of fd_in will be off_in. + * fd_out = output filedescriptor + * off_out = The description of off_in also applied to off_out. + * len = Up to len bytes would be transfered between file descriptors. + * splice_flags = see man splice(2) for description of flags. 
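Editor's sketch (not part of the patch): a sendfile-like copy from a regular file into a socket through an intermediate pipe, following the splice description above. In real code the two SQEs would typically be linked with `IOSQE_IO_LINK`; that is omitted here for brevity.

void exampleSplice(ref Uring io, int fileFd, int sockFd, uint len)
{
    import core.sys.posix.unistd : pipe;

    int[2] p;
    pipe(p);

    // file -> write end of the pipe (real file offset 0, no offset on the pipe side)
    io.next().prepSplice(fileFd, 0, p[1], ulong.max, len, 0);
    // read end of the pipe -> socket (no offsets on either side)
    io.next().prepSplice(p[0], ulong.max, sockFd, ulong.max, len, 0);

    io.submit(2);
}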
+ */ +ref SubmissionEntry prepSplice(return ref SubmissionEntry entry, + int fd_in, ulong off_in, + int fd_out, ulong off_out, + uint len, uint splice_flags) @safe +{ + entry.prepRW(Operation.SPLICE, fd_out, null, len, off_out); + entry.splice_off_in = off_in; + entry.splice_fd_in = fd_in; + entry.splice_flags = splice_flags; + return entry; +} + +/** + * Note: Available from Linux 5.7 + * + * Params: + * entry = `SubmissionEntry` to prepare + * buf = buffers to provide + * len = length of each buffer to add + * bgid = buffers group id + * bid = starting buffer id + */ +ref SubmissionEntry prepProvideBuffers(return ref SubmissionEntry entry, ubyte[][] buf, uint len, ushort bgid, int bid) @safe +{ + assert(buf.length <= int.max, "Too many buffers"); + assert(len <= uint.max, "Buffer too large"); + version (assert) { + foreach (b; buf) assert(b.length <= len, "Invalid buffer length"); + } + entry.prepRW(Operation.PROVIDE_BUFFERS, cast(int)buf.length, cast(void*)&buf[0][0], len, bid); + entry.buf_group = bgid; + return entry; +} + +/// ditto +ref SubmissionEntry prepProvideBuffers(size_t M, size_t N)(return ref SubmissionEntry entry, ref ubyte[M][N] buf, ushort bgid, int bid) @safe +{ + static assert(N <= int.max, "Too many buffers"); + static assert(M <= uint.max, "Buffer too large"); + entry.prepRW(Operation.PROVIDE_BUFFERS, cast(int)N, cast(void*)&buf[0][0], cast(uint)M, bid); + entry.buf_group = bgid; + return entry; +} + +/// ditto +ref SubmissionEntry prepProvideBuffer(size_t N)(return ref SubmissionEntry entry, ref ubyte[N] buf, ushort bgid, int bid) @safe +{ + static assert(N <= uint.max, "Buffer too large"); + entry.prepRW(Operation.PROVIDE_BUFFERS, 1, cast(void*)&buf[0], cast(uint)N, bid); + entry.buf_group = bgid; + return entry; +} + +/// ditto +ref SubmissionEntry prepProvideBuffer(return ref SubmissionEntry entry, ref ubyte[] buf, ushort bgid, int bid) @safe +{ + assert(buf.length <= uint.max, "Buffer too large"); + entry.prepRW(Operation.PROVIDE_BUFFERS, 1, cast(void*)&buf[0], cast(uint)buf.length, bid); + entry.buf_group = bgid; + return entry; +} + +/** + * Note: Available from Linux 5.7 + */ +ref SubmissionEntry prepRemoveBuffers(return ref SubmissionEntry entry, int nr, ushort bgid) @safe +{ + entry.prepRW(Operation.REMOVE_BUFFERS, nr); + entry.buf_group = bgid; + return entry; +} + +/** + * Note: Available from Linux 5.8 + */ +ref SubmissionEntry prepTee(return ref SubmissionEntry entry, int fd_in, int fd_out, uint nbytes, uint flags) @safe +{ + entry.prepRW(Operation.TEE, fd_out, null, nbytes, 0); + entry.splice_off_in = 0; + entry.splice_fd_in = fd_in; + entry.splice_flags = flags; + return entry; +} + +/** + * Note: Available from Linux 5.11 + */ +ref SubmissionEntry prepShutdown(return ref SubmissionEntry entry, int fd, int how) @safe +{ + return entry.prepRW(Operation.SHUTDOWN, fd, null, how, 0); +} + +/** + * Note: Available from Linux 5.11 + */ +ref SubmissionEntry prepRenameat(return ref SubmissionEntry entry, + int olddfd, const(char)* oldpath, int newfd, const(char)* newpath, int flags) +{ + entry.prepRW(Operation.RENAMEAT, olddfd, cast(void*)oldpath, newfd, cast(ulong)cast(void*)newpath); + entry.rename_flags = flags; + return entry; +} + +/** + * Note: Available from Linux 5.11 + */ +ref SubmissionEntry prepUnlinkat(return ref SubmissionEntry entry, int dirfd, const(char)* path, int flags) +{ + entry.prepRW(Operation.UNLINKAT, dirfd, cast(void*)path, 0, 0); + entry.unlink_flags = flags; + return entry; +} + +/** + * Note: Available from Linux 5.15 + */ +ref 
SubmissionEntry prepMkdirat(return ref SubmissionEntry entry, int dirfd, const(char)* path, mode_t mode) +{ + entry.prepRW(Operation.MKDIRAT, dirfd, cast(void*)path, mode, 0); + return entry; +} + +/** + * Note: Available from Linux 5.15 + */ +ref SubmissionEntry prepSymlinkat(return ref SubmissionEntry entry, const(char)* target, int newdirfd, const(char)* linkpath) +{ + entry.prepRW(Operation.SYMLINKAT, newdirfd, cast(void*)target, 0, cast(ulong)cast(void*)linkpath); + return entry; +} + +/** + * Note: Available from Linux 5.15 + */ +ref SubmissionEntry prepLinkat(return ref SubmissionEntry entry, + int olddirfd, const(char)* oldpath, + int newdirfd, const(char)* newpath, int flags) +{ + entry.prepRW(Operation.LINKAT, olddirfd, cast(void*)oldpath, newdirfd, cast(ulong)cast(void*)newpath); + entry.hardlink_flags = flags; + return entry; +} + +/** + * Note: Available from Linux 5.15 + */ +ref SubmissionEntry prepLink(return ref SubmissionEntry entry, + const(char)* oldpath, const(char)* newpath, int flags) +{ + return prepLinkat(entry, AT_FDCWD, oldpath, AT_FDCWD, newpath, flags); +} + +// ref SubmissionEntry prepMsgRingCqeFlags(return ref SubmissionEntry entry, +// int fd, uint len, ulong data, uint flags, uint cqe_flags) @trusted +// { + // io_uring_prep_rw(IORING_OP_MSG_RING, sqe, fd, NULL, len, data); + // sqe->msg_ring_flags = IORING_MSG_RING_FLAGS_PASS | flags; + // sqe->file_index = cqe_flags; +// } + +/** + * Note: Available from Linux 5.18 + */ +ref SubmissionEntry prepMsgRing(return ref SubmissionEntry entry, + int fd, uint len, ulong data, uint flags) +{ + entry.prepRW(Operation.MSG_RING, fd, null, len, data); + entry.msg_ring_flags = flags; + return entry; +} + +/** + * Note: Available from Linux 5.19 + */ +ref SubmissionEntry prepGetxattr(return ref SubmissionEntry entry, + const(char)* name, char* value, const(char)* path, uint len) +{ + entry.prepRW(Operation.GETXATTR, 0, name, len, cast(ulong)cast(void*)value); + entry.addr3 = cast(ulong)cast(void*)path; + entry.xattr_flags = 0; + return entry; +} + +/** + * Note: Available from Linux 5.19 + */ +ref SubmissionEntry prepSetxattr(return ref SubmissionEntry entry, + const(char)* name, const(char)* value, const(char)* path, uint len, int flags) +{ + entry.prepRW(Operation.SETXATTR, 0, name, len, cast(ulong)cast(void*)value); + entry.addr3 = cast(ulong)cast(void*)path; + entry.xattr_flags = flags; + return entry; +} + +/** + * Note: Available from Linux 5.19 + */ +ref SubmissionEntry prepFgetxattr(return ref SubmissionEntry entry, + int fd, const(char)* name, char* value, uint len) +{ + entry.prepRW(Operation.FGETXATTR, fd, name, len, cast(ulong)cast(void*)value); + entry.xattr_flags = 0; + return entry; +} + +/** + * Note: Available from Linux 5.19 + */ +ref SubmissionEntry prepFsetxattr(return ref SubmissionEntry entry, + int fd, const(char)* name, const(char)* value, uint len, int flags) +{ + entry.prepRW(Operation.FSETXATTR, fd, name, len, cast(ulong)cast(void*)value); + entry.xattr_flags = flags; + return entry; +} + +ref SubmissionEntry prepSocket(return ref SubmissionEntry entry, + int domain, int type, int protocol, uint flags) +{ + entry.prepRW(Operation.SOCKET, domain, null, protocol, type); + entry.rw_flags = cast(ReadWriteFlags)flags; + return entry; +} + +private: + +// uring cleanup +void dispose(ref Uring uring) @trusted +{ + if (uring.payload is null) return; + // debug printf("uring(%d): dispose(%d)\n", uring.payload.fd, uring.payload.refs); + if (--uring.payload.refs == 0) + { + import std.traits : 
hasElaborateDestructor; + // debug printf("uring(%d): free\n", uring.payload.fd); + static if (hasElaborateDestructor!UringDesc) + destroy(*uring.payload); // call possible destructors + free(cast(void*)uring.payload); + } + uring.payload = null; +} + +// system fields descriptor +struct UringDesc +{ + nothrow @nogc: + + int fd; + size_t refs; + SetupParameters params; + SubmissionQueue sq; + CompletionQueue cq; + + iovec[] regBuffers; + + ~this() @trusted + { + if (regBuffers) free(cast(void*)®Buffers[0]); + if (sq.ring) munmap(sq.ring, sq.ringSize); + if (sq.sqes) munmap(cast(void*)&sq.sqes[0], sq.sqes.length * SubmissionEntry.sizeof); + if (cq.ring && cq.ring != sq.ring) munmap(cq.ring, cq.ringSize); + close(fd); + } + + private auto mapRings() @trusted + { + sq.ringSize = params.sq_off.array + params.sq_entries * uint.sizeof; + cq.ringSize = params.cq_off.cqes + params.cq_entries * CompletionEntry.sizeof; + + if (params.features & SetupFeatures.SINGLE_MMAP) + { + if (cq.ringSize > sq.ringSize) sq.ringSize = cq.ringSize; + cq.ringSize = sq.ringSize; + } + + sq.ring = mmap(null, sq.ringSize, + PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, + fd, SetupParameters.SUBMISSION_QUEUE_RING_OFFSET + ); + + if (sq.ring == MAP_FAILED) + { + sq.ring = null; + return -errno; + } + + if (params.features & SetupFeatures.SINGLE_MMAP) + cq.ring = sq.ring; + else + { + cq.ring = mmap(null, cq.ringSize, + PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, + fd, SetupParameters.COMPLETION_QUEUE_RING_OFFSET + ); + + if (cq.ring == MAP_FAILED) + { + cq.ring = null; + return -errno; // cleanup is done in struct destructors + } + } + + uint entries = *cast(uint*)(sq.ring + params.sq_off.ring_entries); + sq.khead = cast(uint*)(sq.ring + params.sq_off.head); + sq.ktail = cast(uint*)(sq.ring + params.sq_off.tail); + sq.localTail = *sq.ktail; + sq.ringMask = *cast(uint*)(sq.ring + params.sq_off.ring_mask); + sq.kflags = cast(uint*)(sq.ring + params.sq_off.flags); + sq.kdropped = cast(uint*)(sq.ring + params.sq_off.dropped); + + // Indirection array of indexes to the sqes array (head and tail are pointing to this array). + // As we don't need some fancy mappings, just initialize it with constant indexes and forget about it. + // That way, head and tail are actually indexes to our sqes array. 
+ foreach (i; 0..entries) + { + *((cast(uint*)(sq.ring + params.sq_off.array)) + i) = i; + } + + auto psqes = mmap( + null, entries * SubmissionEntry.sizeof, + PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, + fd, SetupParameters.SUBMISSION_QUEUE_ENTRIES_OFFSET + ); + + if (psqes == MAP_FAILED) return -errno; + sq.sqes = (cast(SubmissionEntry*)psqes)[0..entries]; + + entries = *cast(uint*)(cq.ring + params.cq_off.ring_entries); + cq.khead = cast(uint*)(cq.ring + params.cq_off.head); + cq.localHead = *cq.khead; + cq.ktail = cast(uint*)(cq.ring + params.cq_off.tail); + cq.ringMask = *cast(uint*)(cq.ring + params.cq_off.ring_mask); + cq.koverflow = cast(uint*)(cq.ring + params.cq_off.overflow); + cq.cqes = (cast(CompletionEntry*)(cq.ring + params.cq_off.cqes))[0..entries]; + cq.kflags = cast(uint*)(cq.ring + params.cq_off.flags); + return 0; + } +} + +/// Wraper for `SubmissionEntry` queue +struct SubmissionQueue +{ + nothrow @nogc: + + // mmaped fields + uint* khead; // controlled by kernel + uint* ktail; // controlled by us + uint* kflags; // controlled by kernel (ie IORING_SQ_NEED_WAKEUP) + uint* kdropped; // counter of invalid submissions (out of bound index) + uint ringMask; // constant mask used to determine array index from head/tail + + // mmap details (for cleanup) + void* ring; // pointer to the mmaped region + size_t ringSize; // size of mmaped memory block + + // mmapped list of entries (fixed length) + SubmissionEntry[] sqes; + + uint localTail; // used for batch submission + + uint head() const @safe pure { return atomicLoad!(MemoryOrder.acq)(*khead); } + uint tail() const @safe pure { return localTail; } + + void flushTail() @safe pure + { + pragma(inline, true); + // debug printf("SQ updating tail: %d\n", localTail); + atomicStore!(MemoryOrder.rel)(*ktail, localTail); + } + + SubmissionQueueFlags flags() const @safe pure + { + return cast(SubmissionQueueFlags)atomicLoad!(MemoryOrder.raw)(*kflags); + } + + bool full() const @safe pure { return sqes.length == length; } + + size_t length() const @safe pure { return tail - head; } + + size_t capacity() const @safe pure { return sqes.length - length; } + + ref SubmissionEntry next()() @safe pure return + { + assert(!full, "SumbissionQueue is full"); + return sqes[localTail++ & ringMask]; + } + + void put()(auto ref SubmissionEntry entry) @safe pure + { + assert(!full, "SumbissionQueue is full"); + sqes[localTail++ & ringMask] = entry; + } + + void put(OP)(auto ref OP op) + if (!is(OP == SubmissionEntry)) + { + assert(!full, "SumbissionQueue is full"); + sqes[localTail++ & ringMask].fill(op); + } + + private void putWith(alias FN, ARGS...)(auto ref ARGS args) + { + import std.traits : Parameters, ParameterStorageClass, ParameterStorageClassTuple; + + static assert( + Parameters!FN.length >= 1 + && is(Parameters!FN[0] == SubmissionEntry) + && ParameterStorageClassTuple!FN[0] == ParameterStorageClass.ref_, + "Alias function must accept at least `ref SubmissionEntry`"); + + static assert( + is(typeof(FN(sqes[localTail & ringMask], args))), + "Provided function is not callable with " ~ (Parameters!((ref SubmissionEntry e, ARGS args) {})).stringof); + + assert(!full, "SumbissionQueue is full"); + FN(sqes[localTail++ & ringMask], args); + } + + uint dropped() const @safe pure { return atomicLoad!(MemoryOrder.raw)(*kdropped); } +} + +struct CompletionQueue +{ + nothrow @nogc: + + // mmaped fields + uint* khead; // controlled by us (increment after entry at head was read) + uint* ktail; // updated by kernel + uint* koverflow; + uint* 
kflags; + CompletionEntry[] cqes; // array of entries (fixed length) + + uint ringMask; // constant mask used to determine array index from head/tail + + // mmap details (for cleanup) + void* ring; + size_t ringSize; + + uint localHead; // used for bulk reading + + uint head() const @safe pure { return localHead; } + uint tail() const @safe pure { return atomicLoad!(MemoryOrder.acq)(*ktail); } + + void flushHead() @safe pure + { + pragma(inline, true); + // debug printf("CQ updating head: %d\n", localHead); + atomicStore!(MemoryOrder.rel)(*khead, localHead); + } + + bool empty() const @safe pure { return head == tail; } + + ref CompletionEntry front() @safe pure return + { + assert(!empty, "CompletionQueue is empty"); + return cqes[localHead & ringMask]; + } + + void popFront() @safe pure + { + pragma(inline); + assert(!empty, "CompletionQueue is empty"); + localHead++; + flushHead(); + } + + size_t length() const @safe pure { return tail - localHead; } + + uint overflow() const @safe pure { return atomicLoad!(MemoryOrder.raw)(*koverflow); } + + /// Runtime CQ flags - written by the application, shouldn't be modified by the kernel. + void flags(CQRingFlags flags) @safe pure { atomicStore!(MemoryOrder.raw)(*kflags, flags); } +} + +// just a helper to use atomicStore more easily with older compilers +void atomicStore(MemoryOrder ms, T, V)(ref T val, V newVal) @trusted +{ + pragma(inline, true); + import core.atomic : store = atomicStore; + static if (__VERSION__ >= 2089) store!ms(val, newVal); + else store!ms(*(cast(shared T*)&val), newVal); +} + +// just a helper to use atomicLoad more easily with older compilers +T atomicLoad(MemoryOrder ms, T)(ref const T val) @trusted +{ + pragma(inline, true); + import core.atomic : load = atomicLoad; + static if (__VERSION__ >= 2089) return load!ms(val); + else return load!ms(*(cast(const shared T*)&val)); +} + +version (assert) +{ + import std.range.primitives : ElementType, isInputRange, isOutputRange; + static assert(isInputRange!Uring && is(ElementType!Uring == CompletionEntry)); + static assert(isOutputRange!(Uring, SubmissionEntry)); +} + +version (LDC) +{ + import ldc.intrinsics : llvm_expect; + alias _expect = llvm_expect; +} +else +{ + T _expect(T)(T val, T expected_val) if (__traits(isIntegral, T)) + { + pragma(inline, true); + return val; + } +} \ No newline at end of file diff --git a/tests/ut/concurrency/io.d b/tests/ut/concurrency/io.d new file mode 100644 index 0000000..e5c0348 --- /dev/null +++ b/tests/ut/concurrency/io.d @@ -0,0 +1,85 @@ +module ut.concurrency.io; + +import unit_threaded; +import concurrency.io; +import concurrency.scheduler; +import concurrency; +import concurrency.operations; +import std.typecons : tuple; +import core.time : msecs; + +version (linux): + +@safe +@("Schedule.single") +unittest { + auto io = IOContext.construct(12); + io.run(Schedule().then(() => 1)) + .syncWait().value.should == 1; +} + +@safe +@("Schedule.double") +unittest { + auto io = IOContext.construct(12); + io.run( + whenAll( + Schedule().then(() => 1), + Schedule().then(() => 2) + ) + ).syncWait().value.should == tuple(1,2); +} + +@safe +@("ScheduleAfter.single") +unittest { + auto io = IOContext.construct(12); + io.run(ScheduleAfter(1.msecs).then(() => 1)) + .syncWait().value.should == 1; +} + +@safe +@("ScheduleAfter.double") +unittest { + auto io = IOContext.construct(12); + io.run( + whenAll( + ScheduleAfter(1.msecs).then(() => 1), + ScheduleAfter(1.msecs).then(() => 2) + ) + ).syncWait().value.should == tuple(1,2); +} + +@safe 
+@("acceptAsync.connectAsync") +unittest { + import concurrency.io.socket; + auto fd = listenTcp("127.0.0.1", 0); + auto socket = tcpSocket(); + auto port = fd.getPort(); + auto io = IOContext.construct(12); + + auto result = io.run( + whenAll( + acceptAsync(fd), + connectAsync(socket, "127.0.0.1", port), + ) + ).syncWait().assumeOk; + + auto client = result[0]; + + closeSocket(client.fd); + closeSocket(socket); + closeSocket(fd); +} + +@safe +@("acceptAsync.missing.ioscheduler") +unittest { + import concurrency.io.socket; + import concurrency.sender; + import std.socket; + acceptAsync(cast(socket_t)0) + .toSenderObject + .syncWait().value.shouldThrow; +} diff --git a/tests/ut/ut_runner.d b/tests/ut/ut_runner.d index dd60e41..274b634 100644 --- a/tests/ut/ut_runner.d +++ b/tests/ut/ut_runner.d @@ -20,5 +20,6 @@ int main(string[] args) { "concurrency.timingwheels", "concurrency.stoptoken", "ut.concurrency.stoptoken", + "ut.concurrency.io", ); }