From 63dc181f7cb8332362529e84f573cdf488a0b3f0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lukas=20Backstr=C3=B6m?= <lukas@erlang.org>
Date: Fri, 20 Dec 2024 15:53:55 +0100
Subject: [PATCH 1/5] otp: Fix type and flavor handling when running tests

---
 make/test_target_script.sh | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/make/test_target_script.sh b/make/test_target_script.sh
index 83e614135a9b..bdc3bf561d92 100755
--- a/make/test_target_script.sh
+++ b/make/test_target_script.sh
@@ -269,14 +269,16 @@ if [ -n "${FLAVOR}" ]; then
 fi
 
 # Compile test server and configure
-if [ ! -f "$ERL_TOP/lib/common_test/test_server/variables" ]; then
+if [ ! -f "$ERL_TOP/lib/common_test/test_server/variables.${TYPE}.${FLAVOR}" ]; then
     cd "$ERL_TOP/lib/common_test/test_server"
-    ( make && erl -noshell -eval "ts:install()." -s init stop )  > "$INSTALL_TEST_LOG" 2>&1
+    ( ${MAKE:-make} && erl -noshell -eval "ts:install()." -s init stop )  > "$INSTALL_TEST_LOG" 2>&1
     if [ $? != 0 ]
     then
         cat "$INSTALL_TEST_LOG"
         print_highlighted_msg $RED "\"make && erl -eval 'ts:install()'\" in common_test/test_server failed."
         exit 1
+    else
+        cp "$ERL_TOP/lib/common_test/test_server/variables" "$ERL_TOP/lib/common_test/test_server/variables.${TYPE}.${FLAVOR}"
     fi
 fi
 
@@ -284,7 +286,7 @@ fi
 cd $MAKE_TEST_REL_DIR
 
 erl -sname test -noshell -pa "$ERL_TOP/lib/common_test/test_server" \
-    -eval "ts:compile_datadirs(\"$ERL_TOP/lib/common_test/test_server/variables\",\"*_SUITE_data\")."\
+    -eval "ts:compile_datadirs(\"$ERL_TOP/lib/common_test/test_server/variables.${TYPE}.${FLAVOR}\",\"*_SUITE_data\")."\
     -s init stop > "$COMPILE_TEST_LOG" 2>&1
 
 if [ $? != 0 ]

From 5c55e571b93c0b563d2650d4bcf2a687c5d42397 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lukas=20Backstr=C3=B6m?= <lukas@erlang.org>
Date: Fri, 6 Dec 2024 12:13:46 +0100
Subject: [PATCH 2/5] erts: Add Check I/O internal documentation

---
 erts/doc/Makefile                     |   4 +-
 erts/emulator/internal_doc/CheckIO.md | 162 ++++++++++++++++++++++++++
 2 files changed, 165 insertions(+), 1 deletion(-)
 create mode 100644 erts/emulator/internal_doc/CheckIO.md

diff --git a/erts/doc/Makefile b/erts/doc/Makefile
index 929b1efcd2bd..d34207bc1ba5 100644
--- a/erts/doc/Makefile
+++ b/erts/doc/Makefile
@@ -40,7 +40,9 @@ assets/erl_uds_dist.erl: ../../lib/kernel/examples/erl_uds_dist/src/erl_uds_dist
 assets/time_compat.erl: ../example/time_compat.erl
 	$(V_at)cp $< $@
 
-$(HTMLDIR)/index.html: $(patsubst ../emulator/internal_doc/assets/%,assets/%,$(wildcard ../emulator/internal_doc/assets/*.png) $(wildcard ../emulator/internal_doc/assets/*.svg)) assets/gen_tcp_dist.erl assets/erl_uds_dist.erl assets/time_compat.erl
+$(HTMLDIR)/index.html: $(wildcard ../emulator/internal_doc/*.md) \
+	$(patsubst ../emulator/internal_doc/assets/%,assets/%,$(wildcard ../emulator/internal_doc/assets/*.png) $(wildcard ../emulator/internal_doc/assets/*.svg)) \
+	assets/gen_tcp_dist.erl assets/erl_uds_dist.erl assets/time_compat.erl
 
 # ----------------------------------------------------
 # Release Target
diff --git a/erts/emulator/internal_doc/CheckIO.md b/erts/emulator/internal_doc/CheckIO.md
new file mode 100644
index 000000000000..4288dbc6fdd2
--- /dev/null
+++ b/erts/emulator/internal_doc/CheckIO.md
@@ -0,0 +1,162 @@
+<!--
+%CopyrightBegin%
+
+Copyright Ericsson AB 2025. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+%CopyrightEnd%
+-->
+
+# Checking for I/O events
+
+An I/O event in ERTS is any event triggered by a [file descriptor]
+on Unix or any OBJECT HANDLE that can be passed to [WaitForMultipleObjects] on Windows.
+The check I/O infrastructure is used by linked-in drivers through [driver_select](erl_driver.md#driver_select)
+and by NIFs through [enif_select](erl_nif.md#enif_select).
+
+The main user of the check I/O subsystem is network communication through
+`m:gen_tcp`, `m:gen_udp`, `m:gen_sctp` and `m:socket` on Unix (on Windows
+`m:socket` used its own internal check I/O implementation based on completion ports).
+It is also used by various other parts, such as when doing `os:cmd/1` or
+reading from the terminal.
+
+This document gives an overview of how the check I/O subsystem works.
+
+The check I/O subsystem consists of a platform specific ([erl_poll](#polling))
+and platform agnostic part ([check_io](#check-i-o)).
+
+[erl_poll] is the basic mechanisms for checking if any events have been signalled
+and allows waiting for a timeout if needed. The implementation of polling is very
+platform specific and lives in [erts/emulator/sys/common/erl_poll.c] for Unix and
+[erts/emulator/sys/win32/erl_poll.c] for Windows.
+
+[check_io](#check-i-o) is the cross-platform part of the check I/O subsystem
+that makes sure that [erl_poll] has the correct state and dispatches events to
+the correct entity. The implementation can be found in [erts/emulator/sys/common/erl_check_io.c].
+
+check_io is then used by ports and NIFs to listen to events. Ports are
+communicated to through [port signals](PortSignals.md) and are delivered through
+the [ready_input](driver_entry.md#ready_input) and [ready_output](driver_entry.md#ready_output) callbacks.
+[NIFs](erl_nif.md) get an Erlang message whenever an event is triggered.
+
+[file descriptor]: https://en.wikipedia.org/wiki/File_descriptor
+[erl_poll]: #polling
+[erts/emulator/sys/common/erl_poll.c]: https://github.com/erlang/otp/blob/master/erts/emulator/sys/common/erl_poll.c
+[erts/emulator/sys/win32/erl_poll.c]: https://github.com/erlang/otp/blob/master/erts/emulator/sys/win32/erl_poll.c
+[erts/emulator/sys/common/erl_check_io.c]: https://github.com/erlang/otp/blob/master/erts/emulator/sys/common/erl_check_io.c
+[erts/emulator/beam/erl_port_task.c]: https://github.com/erlang/otp/blob/master/erts/emulator/beam/erl_port_task.c
+[erl_check_io.c]: https://github.com/erlang/otp/blob/master/erts/emulator/sys/common/erl_check_io.c
+[erts/emulator/beam/erl_port_task.c]: https://github.com/erlang/otp/blob/master/erts/emulator/beam/erl_port_task.c
+[WaitForMultipleObjects]: https://learn.microsoft.com/en-us/windows/win32/api/synchapi/nf-synchapi-waitformultipleobjects
+
+## Polling
+
+The polling subsystem basically has two API functions; erts_poll_control and
+erts_poll_wait. erts_poll_control is used to update a ErtsPollSet and
+erts_poll_wait is used to wait for an event in the pollset to be triggered or a
+timeout to happen (the timeout can be 0 if just checking). Only a single thread
+usually calls erts_poll_wait at the same time, but multiple threads calls
+erts_poll_control at any time.
+
+The implementation of Unix and Windows are completely different as Windows does
+not really have a concept of "polling" for an event.
+
+### Polling on Unix
+
+The poll implemention on Unix support a variety of different polling mechanisms.
+At the writing of this document they are: [epoll] (Linux), [kqueue] (MacOS + *Bsd),
+[/dev/poll] (Solaris), [poll] and [select]. [epoll]+[kqueue]+[/dev/poll] are
+referred to as "kernel polling" methods, as the information about which FDs are currently monitored
+lives in the OS kernel, while [poll]+[select] are "user polling" methods as the
+caller needs to supply all FDs of interest to the kernel everything erts_poll_wait
+is called.
+
+By default all Unix'es use a "kernel polling" method, but has a fallback pollset that
+uses "user polling" for FDs that the "kernel polling" mechanism does not
+support (for example the stdin FD on Linux cannot be monitored by [epoll]).
+
+As the kernel polling methods have their monitoring information in the kernel
+it is possible to update these in parallel and without waking the thread that
+is currently waiting for events. For user polling a queue of needed updates
+is managed by each pollset and whenever an update is done to that queue the
+thread waiting on events is woken up to update the set of file descriptors it
+is waiting on.
+
+When using kernel polling it is possible to have multiple poll threads
+(using the [+IOt](erl_cmd.md#+IOt) flag) that read events from the same pollset.
+This can be useful for very busy systems with many many FDs that become active alot.
+If the kernel you are using is not very good at allowing multiple threads to
+check in the same pollset (this primarily applied to old versions of Linux),
+then it is also possible to configure erts to use separate
+pollsets for each pollthread ((using the [+IOp](erl_cmd.md#+IOp) flag)).
+
+When an event is triggered it is removed from the pollset and needs to be
+re-enabled before any new events are triggered. If [ONESHOT] or equivalent is
+available then kernel polling uses that flag, otherwise erl_poll will update
+the pollset as the event is triggered.
+
+[epoll]: https://man7.org/linux/man-pages/man7/epoll.7.html
+[kqueue]: https://man.freebsd.org/cgi/man.cgi?kqueue
+[/dev/poll]: https://docs.oracle.com/cd/E88353_01/html/E37851/poll-4d.html
+[poll]: https://man7.org/linux/man-pages/man2/poll.2.html
+[select]: https://man7.org/linux/man-pages/man2/select.2.html
+[ONESHOT]: https://man7.org/linux/man-pages/man2/epoll_ctl.2.html#:~:text=EPOLLONESHOT
+
+### Polling on Windows
+
+Polling on Windows uses similar mechanism to "user polling" on Unix, except
+that because WaitForMultipleObjects is limited to wait for 64 handles it
+also needs to manage a thread pool. New threads are created as needed, so
+if the system only ever listens for events on less then 64 handles only 1
+thread will be created, but as the number of concurrent handles grow more
+and more threads will be created.
+
+The thread pool is never shrunk, so if the system at any point uses 1000
+handles, there will forever be 16 threads in the thread pool.
+
+## Check I/O
+
+Checking for I/O is done by dedicated polling threads. By default, one
+thread will always be waiting for I/O events using default polling method
+and the "aux" thread will be waiting in the fallback pollset if such exists.
+
+When an event is triggered it is dispatched to the correct port or process
+depending on whether it is a driver or nif that has requested the event.
+As the pollsets use [ONESHOT], the event is disabled until the port/NIF
+registers a new interest in the event.
+
+When you do a driver_select in a linked-in driver, that select will
+be active until it is disabled. Because of this we need to insert the
+FD back into the pollset when a driver_select event has been handled.
+This is done by the port re-inserting the FD in the pollset after
+a ready_input/ready_output event is called. For NIFs you need to call
+enif_select for each event that you want, so no such mechanism needs
+to exist for NIFs.
+
+### Scheduler pollset
+
+For very active FDs the fact that we need to re-insert events each time
+they trigger can lead to quite a lot of overhead. Because of this there
+is an optimization that places FDs that are never deselected into a
+special pollset managed that is not checked by the poll threads, but
+instead checked by the normal schedulers. In this pollset, the FDs no
+longer use the [ONESHOT] mechanism, instead they trigger as soon as there
+is data. For this to work, and not re-trigger on FDs before the port/nif has
+handled the event, there is a global counter called erts_port_task_outstanding_io_tasks
+that is incremented for each FD that is dispatched from the scheduler pollset.
+That counter is then decremented as the FDs are handled by the ports/processes
+that have subscribed to the event. When it reaches 0, we know that it is
+safe to check for new events. This increases the latency for how quickly
+we check for new events by a bit, but drastically reduces the CPU usage
+for very active FDs.
\ No newline at end of file

From 632912e7dd7b83212fa7616b38ca6de4b7b392cc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lukas=20Backstr=C3=B6m?= <lukas@erlang.org>
Date: Fri, 6 Dec 2024 12:22:57 +0100
Subject: [PATCH 3/5] erts: Refactor check_io to simplify state flags handling

---
 erts/emulator/beam/erl_port.h           |   2 +-
 erts/emulator/beam/erl_port_task.c      |   5 +
 erts/emulator/sys/common/erl_check_io.c | 219 ++++++++++++++++--------
 3 files changed, 155 insertions(+), 71 deletions(-)

diff --git a/erts/emulator/beam/erl_port.h b/erts/emulator/beam/erl_port.h
index ea8a488ea7b1..499fce2fe432 100644
--- a/erts/emulator/beam/erl_port.h
+++ b/erts/emulator/beam/erl_port.h
@@ -334,7 +334,7 @@ Eterm erts_request_io_bytes(Process *c_p);
 #define ERTS_PORT_SFLG_INVALID		((Uint32) (1 << 11))
 /* Last port to terminate halts the emulator */
 #define ERTS_PORT_SFLG_HALT		((Uint32) (1 << 12))
-/* Check if the event in ready_input should be cleaned */
+/* Check if the event in ready_input should be cleaned. */
 #define ERTS_PORT_SFLG_CHECK_FD_CLEANUP ((Uint32) (1 << 13))
 #ifdef DEBUG
 /* Only debug: make sure all flags aren't cleared unintentionally */
diff --git a/erts/emulator/beam/erl_port_task.c b/erts/emulator/beam/erl_port_task.c
index 923f6a324b0c..239024106ffe 100644
--- a/erts/emulator/beam/erl_port_task.c
+++ b/erts/emulator/beam/erl_port_task.c
@@ -595,6 +595,11 @@ reset_executed_io_task_handle(Port *prt, ErtsPortTask *ptp)
                                                   reset_port_task_handle);
                 erts_atomic32_read_band_nob(&prt->state, ~ERTS_PORT_SFLG_CHECK_FD_CLEANUP);
             } else {
+                /* We don't have to call erts_io_notify_port_task_executed for scheduler events
+                   as we will keep the fd in the set. However, if driver_deselect was called in
+                   the ready_input callback, then we do need to call it in order to free the
+                   select structures from the fd state.
+                   */
                 reset_port_task_handle(ptp->u.alive.handle);
             }
         } else
diff --git a/erts/emulator/sys/common/erl_check_io.c b/erts/emulator/sys/common/erl_check_io.c
index f821cca9c854..67f9978e845d 100644
--- a/erts/emulator/sys/common/erl_check_io.c
+++ b/erts/emulator/sys/common/erl_check_io.c
@@ -45,12 +45,16 @@
 
 #if 0
 #define DEBUG_PRINT(FMT, ...) do { erts_printf(FMT "\r\n", ##__VA_ARGS__); fflush(stdout); } while(0)
-#define DEBUG_PRINT_FD(FMT, STATE, ...)                                 \
-    DEBUG_PRINT("%d: " FMT " (ev=%s, ac=%s, flg=%s)",                   \
-                (STATE) ? (STATE)->fd : (ErtsSysFdType)-1, ##__VA_ARGS__, \
-                ev2str((STATE) ? (STATE)->events : ERTS_POLL_EV_NONE),  \
-                ev2str((STATE) ? (STATE)->active_events : ERTS_POLL_EV_NONE), \
-                (STATE) ? flag2str((STATE)->flags) : ERTS_EV_FLAG_CLEAR)
+#define DEBUG_PRINT_FD(FMT, STATE, ...)                                                 \
+    do {                                                                                \
+        const char buff[128];                                                           \
+        DEBUG_PRINT("%d: " FMT " (ev=%s, ac=%s, flg=%s)",                               \
+                (STATE) ? (STATE)->fd : (ErtsSysFdType)-1, ##__VA_ARGS__,               \
+                ev2str((STATE) ? (STATE)->events : ERTS_POLL_EV_NONE),                  \
+                ev2str((STATE) ? (STATE)->active_events : ERTS_POLL_EV_NONE),           \
+                event_state_flag_to_str((STATE) ? (STATE)->flags : ERTS_EV_FLAG_CLEAR,  \
+                            buff, sizeof(buff)));                                       \
+    } while(0)
 #define DEBUG_PRINT_MODE
 #else
 #define DEBUG_PRINT(...)
@@ -74,20 +78,20 @@ typedef enum {
 } EventStateType;
 
 typedef enum {
-    ERTS_EV_FLAG_CLEAR         = 0,
-    ERTS_EV_FLAG_USED          = 1,   /* ERL_DRV_USE has been turned on */
+    ERTS_EV_FLAG_CLEAR         = 0x0,
+    ERTS_EV_FLAG_USED          = 0x1,   /* ERL_DRV_USE has been turned on */
 #if ERTS_POLL_USE_SCHEDULER_POLLING
-    ERTS_EV_FLAG_SCHEDULER     = 2,   /* Set when the fd has been migrated
-                                         to scheduler pollset */
-    ERTS_EV_FLAG_IN_SCHEDULER  = 4,   /* Set when the fd is currently in
-                                         scheduler pollset */
+    ERTS_EV_FLAG_SCHEDULER     = 0x2,   /* Set when the fd has been migrated
+                                           to scheduler pollset */
+    ERTS_EV_FLAG_IN_SCHEDULER  = 0x4,   /* Set when the fd is currently in
+                                           scheduler pollset */
 #else
     ERTS_EV_FLAG_SCHEDULER     = ERTS_EV_FLAG_CLEAR,
     ERTS_EV_FLAG_IN_SCHEDULER  = ERTS_EV_FLAG_CLEAR,
 #endif
 #ifdef ERTS_POLL_USE_FALLBACK
-    ERTS_EV_FLAG_FALLBACK      = 8,   /* Set when kernel poll rejected fd
-                                         and it was put in the nkp version */
+    ERTS_EV_FLAG_FALLBACK      = 0x8,   /* Set when kernel poll rejected fd
+                                           and it was put in the nkp version */
 #else
     ERTS_EV_FLAG_FALLBACK      = ERTS_EV_FLAG_CLEAR,
 #endif
@@ -102,7 +106,7 @@ typedef enum {
 } EventStateFlags;
 
 
-static const char* event_state_flag_to_str(EventStateFlags f)
+static const char* event_state_flag_to_str(EventStateFlags f, const char *buff, int len)
 {
     switch ((int)f) {
     case ERTS_EV_FLAG_CLEAR: return "CLEAR";
@@ -117,7 +121,9 @@ static const char* event_state_flag_to_str(EventStateFlags f)
     case ERTS_EV_FLAG_SCHEDULER | ERTS_EV_FLAG_IN_SCHEDULER
         | ERTS_EV_FLAG_USED: return "USED|IN_SCHD";
 #endif
-    default: return "ERROR";
+    default:
+        snprintf((char *)buff, len, "ERROR(%d)", f);
+        return buff;
     }
 }
 
@@ -162,8 +168,18 @@ typedef struct {
             ErtsResource* resource;   /* ERTS_EV_TYPE_STOP_NIF */
         } stop;
     } driver;
-    ErtsPollEvents events;        /* The events that have been selected upon */
-    ErtsPollEvents active_events; /* The events currently active in the pollset */
+    /* The events that have been selected upon, that is the events that
+     * driver_select/enif_select has been issued for. */
+    ErtsPollEvents events;
+    /* The events currently active in the pollset. This can be both fewer and more
+     * events than what are in the `events` fields, but normally they are the same.
+     *
+     * The scenarios when they differ are:
+     *   - When a driver_select has triggered, but ready_input has not yet been called.
+     *     In this scenario `events` will have the event, but it will not be active
+     *     until erts_io_notify_port_task_executed has been called.
+     */
+    ErtsPollEvents active_events;
     EventStateType type;
     EventStateFlags flags;
     int count;                    /* Number of times this fd has triggered
@@ -414,19 +430,23 @@ get_fallback_pollset(void)
 }
 #endif
 
+#if ERTS_POLL_USE_SCHEDULER_POLLING
 static ERTS_INLINE ErtsPollSet *
-get_scheduler_pollset(ErtsSysFdType fd)
+get_scheduler_pollset(void)
 {
-#if ERTS_POLL_USE_SCHEDULER_POLLING
-    return sched_pollset ? sched_pollset : get_pollset(fd);
-#else
-    return get_pollset(fd);
-#endif
+    return sched_pollset;
 }
+#endif
 
 /*
  * Place a fd within a pollset. This will automatically use
- * the fallback ps if needed.
+ * the fallback ps if needed and update the scheduler pollset
+ * if used.
+ * 
+ * Note that for simplicity this function does not add a new fd to
+ * the scheduler pollset. That is done by the functions that manage
+ * the pollset migration.
+ * 
  */
 static ERTS_INLINE ErtsPollEvents
 erts_io_control_wakeup(ErtsDrvEventState *state, ErtsPollOp op,
@@ -440,16 +460,36 @@ erts_io_control_wakeup(ErtsDrvEventState *state, ErtsPollOp op,
 
     if (!(flags & ERTS_EV_FLAG_FALLBACK)) {
 
-        if (op == ERTS_POLL_OP_DEL && (flags & ERTS_EV_FLAG_SCHEDULER)) {
-            erts_poll_control(get_scheduler_pollset(fd), fd, op, pe, wake_poller);
-            flags &= ~ERTS_EV_FLAG_IN_SCHEDULER;
-        }
-        if (!(flags & ERTS_EV_FLAG_IN_SCHEDULER) || (pe & ERTS_POLL_EV_OUT)) {
-            res = erts_poll_control(get_pollset(fd), fd, op, pe, wake_poller);
-        } else {
-            res = erts_poll_control(get_scheduler_pollset(fd), fd, op, pe, wake_poller);
+#if ERTS_POLL_USE_SCHEDULER_POLLING
+        if (flags & ERTS_EV_FLAG_SCHEDULER) {
+            ASSERT(op != ERTS_POLL_OP_ADD);
+            if (op == ERTS_POLL_OP_MOD && (flags & ERTS_EV_FLAG_IN_SCHEDULER) && (pe & ERTS_POLL_EV_IN)) {
+                /* The FD already is enabled in scheduler pollset and we are trying to re-insert */
+                res |= ERTS_POLL_EV_IN;
+            } else if (op == ERTS_POLL_OP_MOD && !(flags & ERTS_EV_FLAG_IN_SCHEDULER) && !(pe & ERTS_POLL_EV_IN)) {
+                /* The FD already is disabled in scheduler pollset and we are trying to disable it */
+                res |= 0;
+            } else {
+                res |= erts_poll_control(get_scheduler_pollset(), fd, op, pe & ERTS_POLL_EV_IN, wake_poller);
+                if (op == ERTS_POLL_OP_DEL) {
+                    state->flags &= ~(ERTS_EV_FLAG_SCHEDULER|ERTS_EV_FLAG_IN_SCHEDULER);
+                } else if (pe & ERTS_POLL_EV_IN) {
+                    state->flags |= ERTS_EV_FLAG_IN_SCHEDULER;
+                } else {
+                    state->flags &= ~ERTS_EV_FLAG_IN_SCHEDULER;
+                }
+            }
+            pe &= ~ERTS_POLL_EV_IN;
+            if (state->flags & ERTS_EV_FLAG_IN_SCHEDULER) {
+                ERTS_ASSERT(state->active_events & ERTS_POLL_EV_IN);
+            } else {
+                ERTS_ASSERT(!(state->active_events & ERTS_POLL_EV_IN));
+            }
         }
+#endif
 
+        res |= erts_poll_control(get_pollset(fd), fd, op, pe, wake_poller);
+        
 #if ERTS_POLL_USE_FALLBACK
         if (op == ERTS_POLL_OP_ADD && res == ERTS_POLL_EV_NVAL) {
             /* When an add fails with NVAL, the poll/kevent operation could not
@@ -482,7 +522,6 @@ erts_io_notify_port_task_executed(ErtsPortTaskType type,
     ErtsIoTask *itp = ErtsContainerStruct(pthp, ErtsIoTask, task);
     ErtsSysFdType fd = itp->fd;
     erts_mtx_t *mtx = fd_mtx(fd);
-    ErtsPollOp op = ERTS_POLL_OP_MOD;
     int active_events, new_events = 0;
     ErtsDrvEventState *state;
     ErtsDrvSelectDataState *free_select = NULL;
@@ -503,21 +542,29 @@ erts_io_notify_port_task_executed(ErtsPortTaskType type,
 
             DEBUG_PRINT_FD("executed ready_input", state);
 
-            if (!(state->flags & ERTS_EV_FLAG_IN_SCHEDULER)
-                && !(active_events & ERTS_POLL_EV_IN)
+            if (!(active_events & ERTS_POLL_EV_IN)
                 && (state->events & ERTS_POLL_EV_IN)) {
 
+                ASSERT(!(state->flags & ERTS_EV_FLAG_SCHEDULER) &&
+                    "active_events should always have ERTS_POLL_EV_IN if we are in scheduler");
+
                 active_events |= ERTS_POLL_EV_IN;
-                if (state->count > 10 && erts_sched_poll_enabled()) {
-                    if (!(state->flags & ERTS_EV_FLAG_SCHEDULER))
-                        op = ERTS_POLL_OP_ADD;
-                    state->flags |= ERTS_EV_FLAG_IN_SCHEDULER|ERTS_EV_FLAG_SCHEDULER;
-                    new_events = ERTS_POLL_EV_IN;
+
+#if ERTS_POLL_USE_SCHEDULER_POLLING
+                if (erts_sched_poll_enabled() &&
+                    !(state->flags & ERTS_EV_FLAG_FALLBACK) &&
+                    state->count++ > 10) {
+                    int wake_poller = 0;
                     DEBUG_PRINT_FD("moving to scheduler ps", state);
+                    new_events = erts_poll_control(get_scheduler_pollset(), fd, ERTS_POLL_OP_ADD,
+                        ERTS_POLL_EV_IN, &wake_poller);
+                    state->flags |= ERTS_EV_FLAG_SCHEDULER|ERTS_EV_FLAG_IN_SCHEDULER;
+                    state->active_events = active_events;
                 } else
+#endif
+                {
                     new_events = active_events;
-                if (!(state->flags & ERTS_EV_FLAG_FALLBACK) && erts_sched_poll_enabled())
-                    state->count++;
+                }
             }
             break;
         case ERTS_PORT_TASK_OUTPUT:
@@ -528,10 +575,8 @@ erts_io_notify_port_task_executed(ErtsPortTaskType type,
                 && (state->events & ERTS_POLL_EV_OUT)) {
 
                 active_events |= ERTS_POLL_EV_OUT;
-                if (state->flags & ERTS_EV_FLAG_IN_SCHEDULER && active_events & ERTS_POLL_EV_IN)
-                    new_events = ERTS_POLL_EV_OUT;
-                else
-                    new_events = active_events;
+
+                new_events = active_events;
             }
             break;
         default:
@@ -542,7 +587,7 @@ erts_io_notify_port_task_executed(ErtsPortTaskType type,
         if (state->active_events != active_events) {
             ASSERT(new_events);
             state->active_events = active_events;
-            new_events = erts_io_control(state, op, new_events);
+            new_events = erts_io_control(state, ERTS_POLL_OP_MOD, new_events);
         }
 
         /* We were unable to re-insert the fd into the pollset, signal the callback. */
@@ -553,6 +598,13 @@ erts_io_notify_port_task_executed(ErtsPortTaskType type,
                 oready(state->driver.select->outport, state);
             state->active_events = 0;
             active_events = 0;
+
+#if ERTS_POLL_USE_SCHEDULER_POLLING
+            /* The error has only happened for scheduler or normal pollset,
+               so when cleaning up we should also delete it from the other pollset.
+            */
+            erts_io_control(state, ERTS_POLL_OP_DEL, 0);
+#endif
         }
     }
 
@@ -747,7 +799,7 @@ deselect(ErtsDrvEventState *state, int mode)
 	    break;
 	}
 	state->type = ERTS_EV_TYPE_NONE;
-	state->flags = 0;
+	state->flags = ERTS_EV_FLAG_CLEAR;
     } else {
         ErtsPollEvents new_events =
             erts_io_control(state, ERTS_POLL_OP_MOD, state->active_events);
@@ -794,7 +846,7 @@ check_fd_cleanup(ErtsDrvEventState *state,
     if (((state->type != ERTS_EV_TYPE_NONE)
          | (state->driver.nif != NULL)
 	 | (state->driver.select != NULL)) == 0) {
-
+        state->flags = ERTS_EV_FLAG_CLEAR;
 	erase_drv_ev_state(state);
     }
 }
@@ -819,6 +871,7 @@ driver_select(ErlDrvPort ix, ErlDrvEvent e, int mode, int on)
     ErtsDrvEventState *state;
     int wake_poller = 0;
     int ret;
+    int is_scheduler;
     ErtsDrvSelectDataState *free_select = NULL;
     ErtsNifSelectDataState *free_nif = NULL;
 #ifdef USE_VM_PROBES
@@ -920,6 +973,8 @@ driver_select(ErlDrvPort ix, ErlDrvEvent e, int mode, int on)
 
     old_events = state->events;
 
+    is_scheduler = state->flags & ERTS_EV_FLAG_SCHEDULER;
+
     if (on) {
         ctl_events &= ~old_events;
         state->events |= ctl_events;
@@ -931,8 +986,6 @@ driver_select(ErlDrvPort ix, ErlDrvEvent e, int mode, int on)
             ctl_op = ERTS_POLL_OP_ADD;
         }
         new_events = state->active_events;
-        if (state->flags & ERTS_EV_FLAG_IN_SCHEDULER)
-            new_events &= ~ERTS_POLL_EV_IN;
     }
     else {
         ctl_events &= old_events;
@@ -940,13 +993,6 @@ driver_select(ErlDrvPort ix, ErlDrvEvent e, int mode, int on)
         state->active_events &= ~ctl_events;
         new_events = state->active_events;
 
-        if (ctl_events & ERTS_POLL_EV_IN) {
-            state->count = 0;
-            if (state->flags & ERTS_EV_FLAG_IN_SCHEDULER) {
-                new_events = 0;
-            }
-        }
-
         if (!state->events) {
             if (!(state->flags & ERTS_EV_FLAG_USED) || mode & ERL_DRV_USE)
                 ctl_op = ERTS_POLL_OP_DEL;
@@ -988,7 +1034,7 @@ driver_select(ErlDrvPort ix, ErlDrvEvent e, int mode, int on)
             if (ctl_events & ERTS_POLL_EV_IN) {
                 abort_tasks(state, ERL_DRV_READ);
                 state->driver.select->inport = NIL;
-                state->flags &= ~ERTS_EV_FLAG_IN_SCHEDULER;
+                ASSERT(!(state->flags & ERTS_EV_FLAG_IN_SCHEDULER));
             }
             if (ctl_events & ERTS_POLL_EV_OUT) {
                 abort_tasks(state, ERL_DRV_WRITE);
@@ -997,9 +1043,9 @@ driver_select(ErlDrvPort ix, ErlDrvEvent e, int mode, int on)
             if (state->events == 0) {
                 if ((mode & ERL_DRV_USE) || !(state->flags & ERTS_EV_FLAG_USED)) {
                     state->type = ERTS_EV_TYPE_NONE;
-                    if (state->flags & ERTS_EV_FLAG_SCHEDULER)
+                    if (is_scheduler)
                         erts_atomic32_read_bor_nob(&prt->state, ERTS_PORT_SFLG_CHECK_FD_CLEANUP);
-                    state->flags = 0;
+                    state->flags &= ERTS_EV_FLAG_NIF_SELECT;
                 }
                 /*else keep it, as fd will probably be selected upon again */
             }
@@ -1793,10 +1839,16 @@ erts_check_io(ErtsPollThread *psi, ErtsMonotonicTime timeout_time, int poll_only
             revents = state->active_events;
             state->active_events = 0;
 
-            if (state->flags & ERTS_EV_FLAG_IN_SCHEDULER) {
-                erts_io_control(state, ERTS_POLL_OP_MOD, 0);
-                state->flags &= ~ERTS_EV_FLAG_IN_SCHEDULER;
+#if ERTS_POLL_USE_SCHEDULER_POLLING
+            if (psi->ps == get_scheduler_pollset() && state->flags & ERTS_EV_FLAG_SCHEDULER) {
+                /* In the poll thread, this fd would have been disabled due to ONESHOT,
+                   but in the scheduler pollset it needs to be disabled. */
+                int wake_poller = 0;
+                erts_poll_control(psi->ps, fd, ERTS_POLL_OP_DEL, 0, &wake_poller);
+                state->flags &= ~(ERTS_EV_FLAG_SCHEDULER|ERTS_EV_FLAG_IN_SCHEDULER);
+                state->count = 0;
             }
+#endif
         } else {
 
             /* Disregard any events that are not active at the moment,
@@ -1804,14 +1856,27 @@ erts_check_io(ErtsPollThread *psi, ErtsMonotonicTime timeout_time, int poll_only
                select/deselect in rapid succession. */
             revents &= state->active_events | ERTS_POLL_EV_NVAL;
 
-            if (psi->ps != get_scheduler_pollset(fd) || !erts_sched_poll_enabled()) {
+#if ERTS_POLL_USE_SCHEDULER_POLLING
+            if (psi->ps == get_scheduler_pollset()) {
+                if (!(state->events & ERTS_POLL_EV_IN) && state->flags & ERTS_EV_FLAG_SCHEDULER) {
+                    /* If we triggered in a scheduler pollset and EV_IN is not set,
+                       then we should just remove it from the scheduler pollset.
+                    */
+                    int wake_poller = 0;
+                    erts_poll_control(psi->ps, fd, ERTS_POLL_OP_DEL, 0, &wake_poller);
+                    state->flags &= ~(ERTS_EV_FLAG_IN_SCHEDULER|ERTS_EV_FLAG_SCHEDULER);
+                    state->active_events &= ~ERTS_POLL_EV_IN;
+                    state->count = 0;
+                }
+            } else
+#endif
+            {
                 ErtsPollEvents reactive_events;
                 state->active_events &= ~revents;
 
                 reactive_events = state->active_events;
 
                 if (state->flags & ERTS_EV_FLAG_IN_SCHEDULER) {
-                    reactive_events &= ~ERTS_POLL_EV_IN;
                     state->active_events |= ERTS_POLL_EV_IN;
                 }
 
@@ -1914,6 +1979,10 @@ erts_check_io(ErtsPollThread *psi, ErtsMonotonicTime timeout_time, int poll_only
         }
 
         case ERTS_EV_TYPE_STOP_USE: {
+#if ERTS_POLL_USE_SCHEDULER_POLLING
+            if (psi->ps == get_scheduler_pollset())
+                break;
+#endif
 #if ERTS_POLL_USE_FALLBACK
             ASSERT(psi->ps == get_fallback_pollset());
 #endif
@@ -1963,7 +2032,7 @@ erts_check_io(ErtsPollThread *psi, ErtsMonotonicTime timeout_time, int poll_only
 
     /* The entire pollres array was filled with events,
      * grow it for the next call. We do this for two reasons:
-     * 1. Pulling out more events in on go will increase throughput
+     * 1. Pulling out more events in one go will increase throughput
      * 2. If the polling implementation is not fair, this will make
      *    sure that we get all fds that we can. i.e. if 12 fds are
      *    constantly active, but we only have a pollres_len of 10,
@@ -2293,13 +2362,15 @@ erts_init_check_io(int *argc, char **argv)
     psiv++;
 #endif
 
+#if ERTS_POLL_USE_SCHEDULER_POLLING
     if (erts_sched_poll_enabled()) {
         psiv[0].pollres_len = ERTS_CHECK_IO_POLL_RES_LEN;
         psiv[0].pollres = erts_alloc(ERTS_ALC_T_POLLSET,
                                      sizeof(ErtsPollResFd) * ERTS_CHECK_IO_POLL_RES_LEN);
-        psiv[0].ps = get_scheduler_pollset(0);
+        psiv[0].ps = get_scheduler_pollset();
         psiv++;
     }
+#endif
 
     for (j = 0; j < erts_no_poll_threads; j++) {
         psiv[j].pollres_len = ERTS_CHECK_IO_POLL_RES_LEN;
@@ -2564,11 +2635,12 @@ print_events(erts_dsprintf_buf_t *dsbufp, ErtsPollEvents ev)
 static ERTS_INLINE void
 print_flags(erts_dsprintf_buf_t *dsbufp, EventStateFlags f)
 {
+    const char buff[64];
     if (f & ERTS_EV_FLAG_WANT_ERROR) {
         erts_dsprintf(dsbufp, "WANTERR|");
         f &= ~ERTS_EV_FLAG_WANT_ERROR;
     }
-    erts_dsprintf(dsbufp, "%s", event_state_flag_to_str(f));
+    erts_dsprintf(dsbufp, "%s", event_state_flag_to_str(f, buff, sizeof(buff)));
 }
 
 #ifdef DEBUG_PRINT_MODE
@@ -2603,6 +2675,10 @@ nifmode2str(enum ErlNifSelectFlags mode) {
     case ERL_NIF_SELECT_CANCEL|ERL_NIF_SELECT_WRITE: return "CANCEL|WRITE";
     case ERL_NIF_SELECT_CANCEL|ERL_NIF_SELECT_READ|ERL_NIF_SELECT_WRITE:
         return "CANCEL|READ|WRITE";
+    case ERL_NIF_SELECT_CUSTOM_MSG|ERL_NIF_SELECT_READ: return "CUSTOM|READ";
+    case ERL_NIF_SELECT_CUSTOM_MSG|ERL_NIF_SELECT_WRITE: return "CUSTOM|WRITE";
+    case ERL_NIF_SELECT_CUSTOM_MSG|ERL_NIF_SELECT_READ|ERL_NIF_SELECT_WRITE:
+        return "CUSTOM|READ|WRITE";
     default: return "UNKNOWN";
     }
 }
@@ -2892,6 +2968,9 @@ static void doit_erts_check_io_debug(void *vstate, void *vcounters,
 	if (erts_debug_print_checkio_state(dsbufp, state, ep_events, internal)) {
 	    counters->num_errors++;
 	}
+    } else {
+        if (state->driver.select || state->driver.nif)
+            erts_debug_print_checkio_state(dsbufp, state, ep_events, internal);
     }
 }
 

From 55c4436988e572442f949a552a3193cf42e09302 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lukas=20Backstr=C3=B6m?= <lukas@erlang.org>
Date: Fri, 6 Dec 2024 12:25:10 +0100
Subject: [PATCH 4/5] erts: Add scheduler polling support to nifs

---
 erts/emulator/beam/erl_nif.c                  |  23 +-
 erts/emulator/beam/erl_port_task.h            |  15 +
 erts/emulator/beam/erl_proc_sig_queue.c       |  32 ++-
 erts/emulator/beam/erl_proc_sig_queue.h       |   8 +-
 erts/emulator/beam/erl_process.c              |  19 +-
 erts/emulator/beam/erl_process.h              |   6 +
 erts/emulator/beam/global.h                   |   2 +-
 erts/emulator/sys/common/erl_check_io.c       | 262 ++++++++++++++----
 erts/emulator/sys/common/erl_check_io.h       |  17 ++
 erts/emulator/test/driver_SUITE.erl           |   1 +
 erts/emulator/test/nif_SUITE.erl              | 183 +++++++++++-
 erts/emulator/test/nif_SUITE_data/nif_SUITE.c |  41 ++-
 12 files changed, 539 insertions(+), 70 deletions(-)

diff --git a/erts/emulator/beam/erl_nif.c b/erts/emulator/beam/erl_nif.c
index c6d00f312d81..6e05b4cb8782 100644
--- a/erts/emulator/beam/erl_nif.c
+++ b/erts/emulator/beam/erl_nif.c
@@ -810,20 +810,27 @@ int erts_flush_trace_messages(Process *c_p, ErtsProcLocks c_p_locks)
 /** @brief Create a message with the content of process independent \c msg_env.
  *  Invalidates \c msg_env.
  */
-ErtsMessage* erts_create_message_from_nif_env(ErlNifEnv* msg_env)
+ErtsMessage* erts_create_message_from_nif_env(ErlNifEnv* msg_env, Uint extra)
 {
     struct enif_msg_environment_t* menv = (struct enif_msg_environment_t*)msg_env;
     ErtsMessage* mp;
+    ErlHeapFragment *heap_frag;
 
     flush_env(msg_env);
-    mp = erts_alloc_message(0, NULL);
-    mp->data.heap_frag = menv->env.heap_frag;
-    ASSERT(mp->data.heap_frag == MBUF(&menv->phony_proc));
-    if (mp->data.heap_frag != NULL) {
+    mp = erts_alloc_message(extra, NULL);
+    if (extra) {
+        mp->hfrag.next = menv->env.heap_frag;
+        heap_frag = mp->hfrag.next;
+    } else {
+        mp->data.heap_frag = menv->env.heap_frag;
+        heap_frag = mp->data.heap_frag;
+    }
+    ASSERT(heap_frag == MBUF(&menv->phony_proc));
+    if (heap_frag != NULL) {
         /* Move all offheap's from phony proc to the first fragment.
            Quick and dirty... */
-        ASSERT(!is_offheap(&mp->data.heap_frag->off_heap));
-        mp->data.heap_frag->off_heap = MSO(&menv->phony_proc);
+        ASSERT(!is_offheap(&heap_frag->off_heap));
+        heap_frag->off_heap = MSO(&menv->phony_proc);
         clear_offheap(&MSO(&menv->phony_proc));
         menv->env.heap_frag = NULL;
         MBUF(&menv->phony_proc) = NULL;
@@ -944,7 +951,7 @@ int enif_send(ErlNifEnv* env, const ErlNifPid* to_pid,
             }
 #endif
         }
-        mp = erts_create_message_from_nif_env(msg_env);
+        mp = erts_create_message_from_nif_env(msg_env, 0);
         ERL_MESSAGE_TOKEN(mp) = token;
     } else {
         erts_literal_area_t litarea;
diff --git a/erts/emulator/beam/erl_port_task.h b/erts/emulator/beam/erl_port_task.h
index dfc9b4416ce8..7fbf615c4a9d 100644
--- a/erts/emulator/beam/erl_port_task.h
+++ b/erts/emulator/beam/erl_port_task.h
@@ -44,6 +44,7 @@ typedef erts_atomic_t ErtsPortTaskHandle;
 #if (defined(ERL_PROCESS_C__) \
      || defined(ERL_PORT_TASK_C__) \
      || defined(ERL_IO_C__) \
+     || defined(ERL_CHECK_IO_C__) \
      || (ERTS_GLB_INLINE_INCL_FUNC_DEF \
 	 && defined(ERTS_DO_INCL_GLB_INLINE_FUNC_DEF)))
 #define ERTS_INCLUDE_SCHEDULER_INTERNALS
@@ -138,6 +139,8 @@ ERTS_GLB_INLINE void erts_port_task_sched_enter_exiting_state(ErtsPortTaskSched
 
 #if defined(ERTS_INCLUDE_SCHEDULER_INTERNALS) && ERTS_POLL_USE_SCHEDULER_POLLING
 ERTS_GLB_INLINE int erts_port_task_have_outstanding_io_tasks(void);
+ERTS_GLB_INLINE void erts_port_task_inc_outstanding_io_tasks(void);
+ERTS_GLB_INLINE void erts_port_task_dec_outstanding_io_tasks(void);
 /* NOTE: Do not access any of the exported variables directly */
 extern erts_atomic_t erts_port_task_outstanding_io_tasks;
 #endif
@@ -226,6 +229,18 @@ erts_port_task_have_outstanding_io_tasks(void)
     return (erts_atomic_read_acqb(&erts_port_task_outstanding_io_tasks)
 	    != 0);
 }
+
+ERTS_GLB_INLINE void
+erts_port_task_inc_outstanding_io_tasks(void)
+{
+    erts_atomic_inc_nob(&erts_port_task_outstanding_io_tasks);
+}
+
+ERTS_GLB_INLINE void
+erts_port_task_dec_outstanding_io_tasks(void)
+{
+    erts_atomic_dec_nob(&erts_port_task_outstanding_io_tasks);
+}
 #endif
 
 #endif
diff --git a/erts/emulator/beam/erl_proc_sig_queue.c b/erts/emulator/beam/erl_proc_sig_queue.c
index ea0fa38848bd..44dfcc5eb1af 100644
--- a/erts/emulator/beam/erl_proc_sig_queue.c
+++ b/erts/emulator/beam/erl_proc_sig_queue.c
@@ -42,6 +42,7 @@
 #include "bif.h"
 #include "erl_bif_unique.h"
 #include "erl_proc_sig_queue.h"
+#include "erl_check_io.h"
 #include "dtrace-wrapper.h"
 
 #define ERTS_SIG_REDS_CNT_FACTOR 4
@@ -2472,6 +2473,16 @@ erts_proc_sig_send_link(ErtsPTabElementCommon *sender, Eterm from,
     return proc_queue_signal(sender, from, to, sig, 0, ERTS_SIG_Q_OP_LINK);
 }
 
+int
+erts_proc_sig_send_nif_select(Eterm to, ErtsMessage *msg) {
+    ErtsSignal *sig = (ErtsSignal*)msg;
+
+    sig->common.tag = ERTS_PROC_SIG_MAKE_TAG(ERTS_SIG_Q_OP_NIF_SELECT,
+                                             ERTS_SIG_Q_TYPE_UNDEFINED,
+                                             0);
+    return proc_queue_signal(NULL, am_system, to, sig, 0, ERTS_SIG_Q_OP_NIF_SELECT);
+}
+
 ErtsSigUnlinkOp *
 erts_proc_sig_make_unlink_op(ErtsPTabElementCommon *sender, Eterm from)
 {
@@ -6355,6 +6366,20 @@ erts_proc_sig_handle_incoming(Process *c_p, erts_aint32_t *statep,
 
             break;
         }
+#if ERTS_POLL_USE_SCHEDULER_POLLING
+        case ERTS_SIG_Q_OP_NIF_SELECT: {
+
+            Eterm msg = erts_io_handle_nif_select(sig);
+
+            convert_prepared_sig_to_msg(c_p, &tracing, sig, msg, am_undefined, next_nm_sig);
+
+            cnt += 4;
+
+            erts_proc_notify_new_message(c_p, ERTS_PROC_LOCK_MAIN);
+
+            break;
+        }
+#endif
 
         default:
             ERTS_INTERNAL_ERROR("Unknown signal");
@@ -6726,6 +6751,7 @@ erts_proc_sig_handle_exit(Process *c_p, Sint *redsp,
             }
             break;
 
+        case ERTS_SIG_Q_OP_NIF_SELECT:
         case ERTS_SIG_Q_OP_PERSISTENT_MON_MSG:
         case ERTS_SIG_Q_OP_ALIAS_MSG:
             sig->next = NULL;
@@ -6918,6 +6944,7 @@ clear_seq_trace_token(ErtsMessage *sig)
         case ERTS_SIG_Q_OP_RECV_MARK:
         case ERTS_SIG_Q_OP_ADJ_MSGQ:
 	case ERTS_SIG_Q_OP_FLUSH:
+        case ERTS_SIG_Q_OP_NIF_SELECT:
             break;
 
         default:
@@ -7007,7 +7034,9 @@ erts_proc_sig_signal_size(ErtsSignal *sig)
     case ERTS_SIG_Q_OP_SYNC_SUSPEND:
     case ERTS_SIG_Q_OP_PERSISTENT_MON_MSG:
     case ERTS_SIG_Q_OP_IS_ALIVE:
-    case ERTS_SIG_Q_OP_DIST_SPAWN_REPLY: {
+    case ERTS_SIG_Q_OP_DIST_SPAWN_REPLY:
+    case ERTS_SIG_Q_OP_NIF_SELECT:
+    {
         ErlHeapFragment *hf;
         size = sizeof(ErtsMessageRef);
         size += ERTS_HEAP_FRAG_SIZE(((ErtsMessage *) sig)->hfrag.alloc_size);
@@ -8614,6 +8643,7 @@ erts_proc_sig_debug_foreach_sig(Process *c_p,
                 case ERTS_SIG_Q_OP_RECV_MARK:
 		case ERTS_SIG_Q_OP_FLUSH:
                 case ERTS_SIG_Q_OP_RPC:
+                case ERTS_SIG_Q_OP_NIF_SELECT:
                     break;
 
                 default:
diff --git a/erts/emulator/beam/erl_proc_sig_queue.h b/erts/emulator/beam/erl_proc_sig_queue.h
index 6b4e22090b6e..cb0892cf54a5 100644
--- a/erts/emulator/beam/erl_proc_sig_queue.h
+++ b/erts/emulator/beam/erl_proc_sig_queue.h
@@ -169,7 +169,7 @@ typedef struct {
  * Note that not all signal are handled using this functionality!
  */
 
-#define ERTS_SIG_Q_OP_MAX 19
+#define ERTS_SIG_Q_OP_MAX 20
 
 #define ERTS_SIG_Q_OP_EXIT                      0  /* Exit signal due to bif call */
 #define ERTS_SIG_Q_OP_EXIT_LINKED               1  /* Exit signal due to link break*/
@@ -190,7 +190,8 @@ typedef struct {
 #define ERTS_SIG_Q_OP_RECV_MARK                 16
 #define ERTS_SIG_Q_OP_UNLINK_ACK                17
 #define ERTS_SIG_Q_OP_ADJ_MSGQ                  18
-#define ERTS_SIG_Q_OP_FLUSH			ERTS_SIG_Q_OP_MAX
+#define ERTS_SIG_Q_OP_FLUSH			19
+#define ERTS_SIG_Q_OP_NIF_SELECT		ERTS_SIG_Q_OP_MAX
 
 #define ERTS_SIG_Q_TYPE_MAX (ERTS_MON_LNK_TYPE_MAX + 10)
 
@@ -1178,6 +1179,9 @@ erts_proc_sig_send_cla_request(Process *c_p, Eterm to, Eterm req_id);
 void
 erts_proc_sig_send_move_msgq_off_heap(Eterm to);
 
+int
+erts_proc_sig_send_nif_select(Eterm to, ErtsMessage *msg);
+
 /*
  * End of send operations of currently supported process signals.
  */
diff --git a/erts/emulator/beam/erl_process.c b/erts/emulator/beam/erl_process.c
index 230e4437cfd6..4de61cd00b75 100644
--- a/erts/emulator/beam/erl_process.c
+++ b/erts/emulator/beam/erl_process.c
@@ -3392,7 +3392,7 @@ try_set_sys_scheduling(void)
 
 
 static ERTS_INLINE int
-prepare_for_sys_schedule(void)
+prepare_for_sys_schedule(ErtsSchedulerData *esdp)
 {
     if (erts_sched_poll_enabled()) {
         while (!erts_port_task_have_outstanding_io_tasks()
@@ -3407,7 +3407,7 @@ prepare_for_sys_schedule(void)
 
 #else
 #define clear_sys_scheduling()
-#define prepare_for_sys_schedule() 0
+#define prepare_for_sys_schedule(esdp) 0
 #endif
 
 #ifdef HARDDEBUG
@@ -3542,13 +3542,18 @@ scheduler_wait(int *fcalls, ErtsSchedulerData *esdp, ErtsRunQueue *rq)
                 current_time = 0;
                 timeout_time = ERTS_MONOTONIC_TIME_MAX;
             }
+#if ERTS_POLL_USE_SCHEDULER_POLLING
+            if (!ERTS_SCHEDULER_IS_DIRTY(esdp) && erts_sched_poll_enabled()) {
+                erts_io_clear_nif_select_handles(esdp);
+            }
+#endif
             if (do_timeout) {
                 if (!thr_prgr_active) {
                     erts_thr_progress_active(erts_thr_prgr_data(esdp), thr_prgr_active = 1);
                     sched_wall_time_change(esdp, 1);
                 }
             }
-            else if (!ERTS_SCHEDULER_IS_DIRTY(esdp) && prepare_for_sys_schedule()) {
+            else if (!ERTS_SCHEDULER_IS_DIRTY(esdp) && prepare_for_sys_schedule(esdp)) {
                 /* We sleep in check_io, only for normal schedulers */
                 if (thr_prgr_active) {
                     erts_thr_progress_active(erts_thr_prgr_data(esdp), thr_prgr_active = 0);
@@ -5985,6 +5990,12 @@ init_scheduler_data(ErtsSchedulerData* esdp, int num,
     esdp->io.out = (Uint64) 0;
     esdp->io.in = (Uint64) 0;
 
+#if ERTS_POLL_USE_SCHEDULER_POLLING
+    for (int i = 0; i < sizeof(esdp->nif_select_fds) / sizeof(ErtsSysFdType); i++) {
+        esdp->nif_select_fds[i] = ERTS_SYS_FD_INVALID;
+    }
+#endif
+
     esdp->pending_signal.sig = NULL;
     esdp->pending_signal.to = THE_NON_VALUE;
 #ifdef DEBUG
@@ -9834,7 +9845,7 @@ Process *erts_schedule(ErtsSchedulerData *esdp, Process *p, int calls)
 	    goto check_activities_to_run;
 	} else if (is_normal_sched &&
                    fcalls > (2 * context_reds) &&
-                   prepare_for_sys_schedule()) {
+                   prepare_for_sys_schedule(esdp)) {
             ErtsMonotonicTime current_time;
 	    /*
 	     * Schedule system-level activities.
diff --git a/erts/emulator/beam/erl_process.h b/erts/emulator/beam/erl_process.h
index a3fb22b84442..73c4b7dba0aa 100644
--- a/erts/emulator/beam/erl_process.h
+++ b/erts/emulator/beam/erl_process.h
@@ -71,6 +71,9 @@ typedef struct process Process;
 #include "erl_thr_progress.h"
 #undef ERL_THR_PROGRESS_TSD_TYPE_ONLY
 
+// Included for ERTS_POLL_USE_SCHEDULER_POLLING
+#include "erl_poll.h"
+
 #define ERTS_HAVE_SCHED_UTIL_BALANCING_SUPPORT_OPT	0
 #define ERTS_HAVE_SCHED_UTIL_BALANCING_SUPPORT		0
 
@@ -721,6 +724,9 @@ struct ErtsSchedulerData_ {
 	Uint64 out;
 	Uint64 in;
     } io;
+#if ERTS_POLL_USE_SCHEDULER_POLLING
+    ErtsSysFdType nif_select_fds[5]; /* Used by check io */
+#endif
     struct {
         ErtsSignal* sig;
         Eterm to;
diff --git a/erts/emulator/beam/global.h b/erts/emulator/beam/global.h
index c172ac0df50a..82ee425a82d5 100644
--- a/erts/emulator/beam/global.h
+++ b/erts/emulator/beam/global.h
@@ -142,7 +142,7 @@ extern Eterm erts_nif_call_function(Process *p, Process *tracee,
 
 int erts_call_dirty_nif(ErtsSchedulerData *esdp, Process *c_p,
                         ErtsCodePtr I, Eterm *reg);
-ErtsMessage* erts_create_message_from_nif_env(ErlNifEnv* msg_env);
+ErtsMessage* erts_create_message_from_nif_env(ErlNifEnv* msg_env, Uint extra);
 
 
 /* Driver handle (wrapper for old plain handle) */
diff --git a/erts/emulator/sys/common/erl_check_io.c b/erts/emulator/sys/common/erl_check_io.c
index 67f9978e845d..48a34fc97b1e 100644
--- a/erts/emulator/sys/common/erl_check_io.c
+++ b/erts/emulator/sys/common/erl_check_io.c
@@ -35,9 +35,11 @@
 #include "sys.h"
 #include "global.h"
 #include "erl_port.h"
+#include "erl_port_task.h"
 #include "erl_check_io.h"
 #include "erl_thr_progress.h"
 #include "erl_bif_unique.h"
+#include "erl_proc_sig_queue.h"
 #include "dtrace-wrapper.h"
 #include "lttng-wrapper.h"
 #define ERTS_WANT_TIMER_WHEEL_API
@@ -85,17 +87,19 @@ typedef enum {
                                            to scheduler pollset */
     ERTS_EV_FLAG_IN_SCHEDULER  = 0x4,   /* Set when the fd is currently in
                                            scheduler pollset */
+    ERTS_EV_FLAG_NIF_SELECT    = 0x8,   /* Set if a nif select message is in-flight */
 #else
     ERTS_EV_FLAG_SCHEDULER     = ERTS_EV_FLAG_CLEAR,
     ERTS_EV_FLAG_IN_SCHEDULER  = ERTS_EV_FLAG_CLEAR,
+    ERTS_EV_FLAG_NIF_SELECT    = ERTS_EV_FLAG_CLEAR,
 #endif
 #ifdef ERTS_POLL_USE_FALLBACK
-    ERTS_EV_FLAG_FALLBACK      = 0x8,   /* Set when kernel poll rejected fd
+    ERTS_EV_FLAG_FALLBACK      = 0x10,  /* Set when kernel poll rejected fd
                                            and it was put in the nkp version */
 #else
     ERTS_EV_FLAG_FALLBACK      = ERTS_EV_FLAG_CLEAR,
 #endif
-    ERTS_EV_FLAG_WANT_ERROR    = 0x10,  /* ERL_NIF_SELECT_ERROR turned on */
+    ERTS_EV_FLAG_WANT_ERROR    = 0x20,  /* ERL_NIF_SELECT_ERROR turned on */
 
     /* Combinations, defined only to be displayed by debugger (gdb) */
     ERTS_EV_FLAG_USED_FALLBACK = ERTS_EV_FLAG_USED | ERTS_EV_FLAG_FALLBACK,
@@ -115,9 +119,14 @@ static const char* event_state_flag_to_str(EventStateFlags f, const char *buff,
     case ERTS_EV_FLAG_FALLBACK | ERTS_EV_FLAG_USED: return "USED|FLBK";
 
 #if ERTS_POLL_USE_SCHEDULER_POLLING
+    case ERTS_EV_FLAG_CLEAR | ERTS_EV_FLAG_NIF_SELECT: return "CLEAR|NIF_SELECT";
+    case ERTS_EV_FLAG_USED | ERTS_EV_FLAG_NIF_SELECT: return "USED|NIF_SELECT";
     case ERTS_EV_FLAG_SCHEDULER: return "SCHD";
     case ERTS_EV_FLAG_SCHEDULER | ERTS_EV_FLAG_USED: return "USED|SCHD";
+    case ERTS_EV_FLAG_SCHEDULER | ERTS_EV_FLAG_NIF_SELECT: return "SCHD|NIF_SELECT";
     case ERTS_EV_FLAG_SCHEDULER | ERTS_EV_FLAG_IN_SCHEDULER: return "IN_SCHD";
+    case ERTS_EV_FLAG_SCHEDULER | ERTS_EV_FLAG_IN_SCHEDULER |
+        ERTS_EV_FLAG_NIF_SELECT: return "IN_SCHD|NIF_SELECT";
     case ERTS_EV_FLAG_SCHEDULER | ERTS_EV_FLAG_IN_SCHEDULER
         | ERTS_EV_FLAG_USED: return "USED|IN_SCHD";
 #endif
@@ -178,6 +187,10 @@ typedef struct {
      *   - When a driver_select has triggered, but ready_input has not yet been called.
      *     In this scenario `events` will have the event, but it will not be active
      *     until erts_io_notify_port_task_executed has been called.
+     *   - When an FD has been migrated to the sched_pollset, but has then been
+     *     deselected. In this scenario the FD will remain in the scheduler pollset
+     *     until it is triggered. In this scenario the event till be active, but not
+     *     part of `events`.
      */
     ErtsPollEvents active_events;
     EventStateType type;
@@ -351,6 +364,7 @@ static ERTS_INLINE void
 check_fd_cleanup(ErtsDrvEventState *state,
 		 ErtsDrvSelectDataState **free_select,
                  ErtsNifSelectDataState **free_nif);
+static void clear_select_event(struct erts_nif_select_event* e);
 static ERTS_INLINE void iready(Eterm id, ErtsDrvEventState *state);
 static ERTS_INLINE void oready(Eterm id, ErtsDrvEventState *state);
 #ifdef DEBUG_PRINT_MODE
@@ -662,7 +676,13 @@ abort_tasks(ErtsDrvEventState *state, int mode)
     }
 }
 
+typedef struct {
+    Eterm message;
+    ErlNifEvent evt;
+} ErtsNifSelectSignalData;
+
 static void prepare_select_msg(struct erts_nif_select_event* e,
+                               ErlNifEvent evt,
                                enum ErlNifSelectFlags mode,
                                Eterm recipient,
                                ErtsResource* resource,
@@ -671,32 +691,34 @@ static void prepare_select_msg(struct erts_nif_select_event* e,
                                Eterm event_atom)
 {
     ErtsMessage* mp;
-    Eterm* hp;
-    Uint hsz;
+    ErtsNifSelectSignalData* xsig;
+    Eterm* hp, *hp_start;
+    Uint hsz = sizeof(ErtsNifSelectSignalData) / sizeof(Eterm); /* size of msg ptr */
 
-    if (is_not_nil(e->pid)) {
-        ASSERT(e->mp);
-        erts_cleanup_messages(e->mp);
-    }
+
+    clear_select_event(e);
 
     if (mode & ERL_NIF_SELECT_CUSTOM_MSG) {
         if (msg_env) {
-            mp = erts_create_message_from_nif_env(msg_env);
+            mp = erts_create_message_from_nif_env(msg_env, hsz);
             ERL_MESSAGE_TERM(mp) = msg;
+            hp = &mp->hfrag.mem[0];
+            hp_start = hp;
         }
         else {
-            hsz = size_object(msg);
+            Eterm msgsz = size_object(msg);
+            hsz += msgsz;
             mp = erts_alloc_message(hsz, &hp);
-            ERL_MESSAGE_TERM(mp) = copy_struct(msg, hsz, &hp, &mp->hfrag.off_heap);
+            hp_start = hp;
+            ERL_MESSAGE_TERM(mp) = copy_struct(msg, msgsz, &hp, &mp->hfrag.off_heap);
         }
     }
     else {
         ErtsBinary* bin;
         Eterm resource_term, ref_term, tuple;
-        Eterm* hp_start;
 
          /* {select, Resource, Ref, EventAtom} */
-        hsz = 5 + ERTS_MAGIC_REF_THING_SIZE;
+        hsz += 5 + ERTS_MAGIC_REF_THING_SIZE;
         if (is_internal_ref(msg))
             hsz += ERTS_REF_THING_SIZE;
         else
@@ -720,7 +742,19 @@ static void prepare_select_msg(struct erts_nif_select_event* e,
         tuple = TUPLE4(hp, am_select, resource_term, ref_term, event_atom);
         hp += 5;
         ERL_MESSAGE_TERM(mp) = tuple;
-        ASSERT(hp == hp_start + hsz); (void)hp_start;
+        ASSERT(hp == hp_start + hsz - sizeof(ErtsNifSelectSignalData)/sizeof(Eterm));
+    }
+
+    mp->hfrag.used_size = hp - hp_start;
+
+    if (mode & ERL_NIF_SELECT_READ && ERTS_POLL_USE_SCHEDULER_POLLING) {
+        /* Save original msg ptr */
+        xsig = (ErtsNifSelectSignalData*)hp;
+        xsig->message = ERL_MESSAGE_TERM(mp);
+        xsig->evt = evt;
+        /* Setting to THE_NON_VALUE so that
+           erts_proc_sig_send_nif_select puts proper signal tag here */
+        ERL_MESSAGE_TERM(mp) = THE_NON_VALUE;
     }
 
     ASSERT(is_not_nil(recipient));
@@ -728,17 +762,85 @@ static void prepare_select_msg(struct erts_nif_select_event* e,
     e->mp = mp;
 }
 
-static ERTS_INLINE void send_select_msg(struct erts_nif_select_event* e)
-{
-    Process* rp = erts_proc_lookup(e->pid);
+#if ERTS_POLL_USE_SCHEDULER_POLLING
+static void
+erts_io_clear_nif_select(ErtsSysFdType fd, ErtsDrvEventState *state) {
+    erts_mtx_t *mtx = NULL;
+    if (!state) {
+        mtx = fd_mtx(fd);
+        erts_mtx_lock(mtx);
+        state = get_drv_ev_state(fd);
+    }
+    if (state->flags & ERTS_EV_FLAG_NIF_SELECT) {
+        DEBUG_PRINT_FD("Clear ERTS_EV_FLAG_NIF_SELECT (%d) on sched %d", state,
+            erts_atomic_read_nob(&erts_port_task_outstanding_io_tasks),
+            erts_get_scheduler_id());
+        state->flags &= ~ERTS_EV_FLAG_NIF_SELECT;
+        erts_port_task_dec_outstanding_io_tasks();
+    }
+    if (mtx)
+        erts_mtx_unlock(mtx);
+}
 
-    ASSERT(is_internal_pid(e->pid));
-    if (!rp) {
-        erts_cleanup_messages(e->mp);
-        return;
+void
+erts_io_clear_nif_select_handles(ErtsSchedulerData *esdp) {
+    for (int i = 0; i < sizeof(esdp->nif_select_fds) / sizeof(ErtsSysFdType); i++) {
+        if (esdp->nif_select_fds[i] != ERTS_SYS_FD_INVALID) {
+            DEBUG_PRINT("%d: Clear in sched %d's state", esdp->nif_select_fds[i], esdp->no);
+            erts_io_clear_nif_select(esdp->nif_select_fds[i], NULL);
+            esdp->nif_select_fds[i] = ERTS_SYS_FD_INVALID;
+        }
+    }
+}
+#endif
+
+#if ERTS_POLL_USE_SCHEDULER_POLLING
+Eterm
+erts_io_handle_nif_select(ErtsMessage *sig) {
+    ErtsNifSelectSignalData* xsig = (ErtsNifSelectSignalData*) (char *) (&sig->hfrag.mem[0]
+                                            + sig->hfrag.used_size);
+    ASSERT(sig->hfrag.used_size < sig->hfrag.alloc_size);
+
+    if (erts_sched_poll_enabled()) {
+        ErtsSchedulerData *esdp = erts_get_scheduler_data();
+        for (int i = 0; i < sizeof(esdp->nif_select_fds) / sizeof(ErtsSysFdType); i++) {
+            if (esdp->nif_select_fds[i] == ERTS_SYS_FD_INVALID) {
+                DEBUG_PRINT("%d: Put in sched %d's state", xsig->evt, esdp->no);
+                esdp->nif_select_fds[i] = xsig->evt;
+                goto done;
+            }
+        }
+        /* There were no slots left, clear the flag and reset the poll counter */
+        erts_io_clear_nif_select(xsig->evt, NULL);
     }
+    done:
+    return xsig->message;
+}
+#endif
 
-    erts_queue_message(rp, 0, e->mp, ERL_MESSAGE_TERM(e->mp), am_system);
+static ERTS_INLINE void send_select_msg(struct erts_nif_select_event* e)
+{
+#if ERTS_POLL_USE_SCHEDULER_POLLING
+    if (ERL_MESSAGE_TERM(e->mp) == THE_NON_VALUE) {
+        /* If message term is THE_NON_VALUE this should be sent as a proc signal */
+        if (!erts_proc_sig_send_nif_select(e->pid, e->mp)) {
+            ErtsNifSelectSignalData* xsig = (ErtsNifSelectSignalData*) (char *) (&e->mp->hfrag.mem[0]
+                                            + e->mp->hfrag.used_size);
+            erts_io_clear_nif_select(xsig->evt, NULL);
+            clear_select_event(e);
+        }
+    } else
+#endif
+    {
+        /* Otherwise send as normal message */
+        Process* rp = erts_proc_lookup(e->pid);
+        ASSERT(is_value(ERL_MESSAGE_TERM(e->mp)));
+        if (!rp) {
+            clear_select_event(e);
+            return;
+        }
+        erts_queue_message(rp, 0, e->mp, ERL_MESSAGE_TERM(e->mp), am_system);
+    }
 }
 
 static void clear_select_event(struct erts_nif_select_event* e)
@@ -746,6 +848,11 @@ static void clear_select_event(struct erts_nif_select_event* e)
     if (is_not_nil(e->pid)) {
         /* Discard unsent message */
         ASSERT(e->mp);
+        if (ERL_MESSAGE_TERM(e->mp) == THE_NON_VALUE) {
+            ErtsNifSelectSignalData* xsig = (ErtsNifSelectSignalData*) (char *) (&e->mp->hfrag.mem[0]
+                                            + e->mp->hfrag.used_size);
+            ERL_MESSAGE_TERM(e->mp) = xsig->message;
+        }
         erts_cleanup_messages(e->mp);
         e->mp = NULL;
         e->pid = NIL;
@@ -782,6 +889,9 @@ deselect(ErtsDrvEventState *state, int mode)
         erts_io_control(state, ERTS_POLL_OP_DEL, 0);
 	switch (state->type) {
         case ERTS_EV_TYPE_NIF:
+#if ERTS_POLL_USE_SCHEDULER_POLLING
+            erts_io_clear_nif_select(state->fd, state);
+#endif
             clear_select_event(&state->driver.nif->in);
             clear_select_event(&state->driver.nif->out);
             clear_select_event(&state->driver.nif->err);
@@ -800,6 +910,7 @@ deselect(ErtsDrvEventState *state, int mode)
 	}
 	state->type = ERTS_EV_TYPE_NONE;
 	state->flags = ERTS_EV_FLAG_CLEAR;
+        state->count = 0;
     } else {
         ErtsPollEvents new_events =
             erts_io_control(state, ERTS_POLL_OP_MOD, state->active_events);
@@ -967,7 +1078,6 @@ driver_select(ErlDrvPort ix, ErlDrvEvent e, int mode, int on)
 	ctl_events |= ERTS_POLL_EV_OUT;
     }
 
-
     ASSERT((state->type == ERTS_EV_TYPE_DRV_SEL) ||
 	   (state->type == ERTS_EV_TYPE_NONE && !state->events));
 
@@ -1126,6 +1236,7 @@ enif_select_x(ErlNifEnv* env,
     enum { NO_STOP=0, CALL_STOP, CALL_STOP_AND_RELEASE } call_stop = NO_STOP;
     ErtsDrvSelectDataState *free_select = NULL;
     ErtsNifSelectDataState *free_nif = NULL;
+    ErtsPollEvents new_events = 0;
 
     ASSERT(!erts_dbg_is_resource_dying(resource));
 
@@ -1201,15 +1312,13 @@ enif_select_x(ErlNifEnv* env,
         erts_dsprintf_buf_t *dsbufp = erts_create_logger_dsbuf();
         print_nif_select_op(dsbufp, fd, mode, resource, msg);
         steal_pending_stop_nif(dsbufp, resource, state, mode, on);
-        if (state->type == ERTS_EV_TYPE_STOP_NIF) {
-            ret = ERL_NIF_SELECT_STOP_SCHEDULED;  /* ?? */
-            goto done;
-        }
+        ret = ERL_NIF_SELECT_STOP_SCHEDULED;
+        goto done;
+    }
+    default:
         ASSERT(state->type == ERTS_EV_TYPE_NONE);
         break;
     }
-    default: break;
-    }
 
     ASSERT((state->type == ERTS_EV_TYPE_NIF) ||
 	   (state->type == ERTS_EV_TYPE_NONE && !state->events));
@@ -1217,11 +1326,33 @@ enif_select_x(ErlNifEnv* env,
     old_events = state->events;
 
     if (on) {
+        if (state->type == ERTS_EV_TYPE_NONE)
+            ctl_op = ERTS_POLL_OP_ADD;
+#if ERTS_POLL_USE_SCHEDULER_POLLING
+        else {    
+            if (!(state->flags & ERTS_EV_FLAG_SCHEDULER) &&
+                !(state->flags & ERTS_EV_FLAG_FALLBACK) &&
+                 (ctl_events & ERTS_POLL_EV_IN)) {
+                if (erts_sched_poll_enabled() && (state->flags & ERTS_EV_FLAG_NIF_SELECT) &&
+                    state->count++ > 10) {
+                    int wake_poller = 0;
+                    DEBUG_PRINT_FD("moving to scheduler ps", state);
+                    new_events = erts_poll_control(get_scheduler_pollset(), fd, ERTS_POLL_OP_ADD,
+                        ERTS_POLL_EV_IN, &wake_poller);
+                    state->flags |= ERTS_EV_FLAG_SCHEDULER|ERTS_EV_FLAG_IN_SCHEDULER;
+                    state->events |= ERTS_POLL_EV_IN;
+                    state->active_events |= ERTS_POLL_EV_IN;
+                    old_events = state->events;
+                }
+            }
+            if (ctl_events & ERTS_POLL_EV_IN) {
+                erts_io_clear_nif_select(fd, state);
+            }
+        }
+#endif
         ctl_events &= ~old_events;
         state->events |= ctl_events;
         state->active_events |= ctl_events;
-        if (state->type == ERTS_EV_TYPE_NONE)
-            ctl_op = ERTS_POLL_OP_ADD;
         if (ctl_events & ERTS_POLL_EV_ERR)
             state->flags |= ERTS_EV_FLAG_WANT_ERROR;
     }
@@ -1229,29 +1360,36 @@ enif_select_x(ErlNifEnv* env,
         ctl_events &= old_events;
         state->events &= ~ctl_events;
         state->active_events &= ~ctl_events;
+        if (ctl_op == ERTS_POLL_OP_DEL && state->flags & ERTS_EV_FLAG_IN_SCHEDULER) {
+            ASSERT(state->active_events & ERTS_POLL_EV_IN || ctl_events & ERTS_POLL_EV_IN);
+            state->active_events &= ~ERTS_POLL_EV_IN;
+        }
+#if ERTS_POLL_USE_SCHEDULER_POLLING
+        erts_io_clear_nif_select(fd, state);
+#endif
     }
 
     if (ctl_events || ctl_op == ERTS_POLL_OP_DEL) {
-        ErtsPollEvents new_events;
 
         new_events = erts_io_control_wakeup(state,
                                             ctl_op,
                                             state->active_events,
                                             &wake_poller);
 
-        if (new_events & ERTS_POLL_EV_NVAL) {
-            if (state->type == ERTS_EV_TYPE_NIF && !old_events) {
-                state->type = ERTS_EV_TYPE_NONE;
-                state->flags = 0;
-                state->driver.nif->in.pid = NIL;
-                state->driver.nif->out.pid = NIL;
-                state->driver.nif->err.pid = NIL;
-                state->driver.stop.resource = NULL;
-            }
-            ret = INT_MIN | ERL_NIF_SELECT_FAILED;
-            goto done;
+        ASSERT(new_events == state->active_events || new_events & ERTS_POLL_EV_NVAL);
+    }
+
+    if (new_events & ERTS_POLL_EV_NVAL) {
+        if (state->type == ERTS_EV_TYPE_NIF && !old_events) {
+            state->type = ERTS_EV_TYPE_NONE;
+            state->flags = 0;
+            state->driver.nif->in.pid = NIL;
+            state->driver.nif->out.pid = NIL;
+            state->driver.nif->err.pid = NIL;
+            state->driver.stop.resource = NULL;
         }
-        ASSERT(new_events == state->events);
+        ret = INT_MIN | ERL_NIF_SELECT_FAILED;
+        goto done;
     }
 
     ASSERT(state->type == ERTS_EV_TYPE_NIF
@@ -1270,17 +1408,17 @@ enif_select_x(ErlNifEnv* env,
         ASSERT(state->type == ERTS_EV_TYPE_NIF);
         ASSERT(state->driver.stop.resource == resource);
         if (mode & ERL_DRV_READ) {
-            prepare_select_msg(&state->driver.nif->in, mode, recipient,
+            prepare_select_msg(&state->driver.nif->in, e, mode, recipient,
                                resource, msg, msg_env, am_ready_input);
             msg_env = NULL;
         }
         if (mode & ERL_DRV_WRITE) {
-            prepare_select_msg(&state->driver.nif->out, mode, recipient,
+            prepare_select_msg(&state->driver.nif->out, e, mode, recipient,
                                resource, msg, msg_env, am_ready_output);
             msg_env = NULL;
         }
         if (mode & ERL_NIF_SELECT_ERROR) {
-            prepare_select_msg(&state->driver.nif->err, mode, recipient,
+            prepare_select_msg(&state->driver.nif->err, e, mode, recipient,
                                resource, msg, msg_env, am_ready_error);
         }
         ret = 0;
@@ -1333,6 +1471,7 @@ enif_select_x(ErlNifEnv* env,
                 state->type = ERTS_EV_TYPE_STOP_NIF;
                 ret |= ERL_NIF_SELECT_STOP_SCHEDULED;
             }
+            state->count = 0;
             state->flags &= ~ERTS_EV_FLAG_WANT_ERROR;
         }
         else
@@ -1798,6 +1937,9 @@ erts_check_io(ErtsPollThread *psi, ErtsMonotonicTime timeout_time, int poll_only
 			  erl_errno_id(poll_ret), poll_ret);
 	    erts_send_error_to_logger_nogl(dsbufp);
 	}
+        // if (is_normal_sched) {
+        //     erts_fprintf(stderr, "%d: woke up\r\n", esdp->no);
+        // }
         ERTS_MSACC_POP_STATE();
 	return;
     }
@@ -1840,11 +1982,11 @@ erts_check_io(ErtsPollThread *psi, ErtsMonotonicTime timeout_time, int poll_only
             state->active_events = 0;
 
 #if ERTS_POLL_USE_SCHEDULER_POLLING
-            if (psi->ps == get_scheduler_pollset() && state->flags & ERTS_EV_FLAG_SCHEDULER) {
+            if (state->flags & ERTS_EV_FLAG_SCHEDULER) {
                 /* In the poll thread, this fd would have been disabled due to ONESHOT,
-                   but in the scheduler pollset it needs to be disabled. */
+                   but in the scheduler pollset it needs to be disabled manually. */
                 int wake_poller = 0;
-                erts_poll_control(psi->ps, fd, ERTS_POLL_OP_DEL, 0, &wake_poller);
+                erts_poll_control(get_scheduler_pollset(), fd, ERTS_POLL_OP_DEL, 0, &wake_poller);
                 state->flags &= ~(ERTS_EV_FLAG_SCHEDULER|ERTS_EV_FLAG_IN_SCHEDULER);
                 state->count = 0;
             }
@@ -1958,6 +2100,16 @@ erts_check_io(ErtsPollThread *psi, ErtsMonotonicTime timeout_time, int poll_only
                 check_fd_cleanup(state, &free_select, &free_nif);
             }
 
+#if ERTS_POLL_USE_SCHEDULER_POLLING
+            if (erts_sched_poll_enabled() && is_not_nil(in_ev.pid)) {
+                DEBUG_PRINT_FD("Set ERTS_EV_FLAG_NIF_SELECT on sched %d",
+                        state, erts_get_scheduler_id());
+                ASSERT(ERL_MESSAGE_TERM(in_ev.mp) == THE_NON_VALUE);
+                state->flags |= ERTS_EV_FLAG_NIF_SELECT;
+                erts_port_task_inc_outstanding_io_tasks();
+            }
+#endif
+
             erts_mtx_unlock(fd_mtx(fd));
 
             if (is_not_nil(in_ev.pid)) {
@@ -1973,6 +2125,13 @@ erts_check_io(ErtsPollThread *psi, ErtsMonotonicTime timeout_time, int poll_only
         }
 
         case ERTS_EV_TYPE_STOP_NIF: {
+#if ERTS_POLL_USE_SCHEDULER_POLLING
+            if (psi->ps == get_scheduler_pollset())
+                break;
+#endif
+#if ERTS_POLL_USE_FALLBACK
+            ASSERT(psi->ps == get_fallback_pollset());
+#endif
             resource = state->driver.stop.resource;
             state->type = ERTS_EV_TYPE_NONE;
             goto case_ERTS_EV_TYPE_NONE;
@@ -1992,6 +2151,9 @@ erts_check_io(ErtsPollThread *psi, ErtsMonotonicTime timeout_time, int poll_only
 	case ERTS_EV_TYPE_NONE: /* Deselected ... */
         case_ERTS_EV_TYPE_NONE:
             state->flags &= ~ERTS_EV_FLAG_FALLBACK;
+#if ERTS_POLL_USE_SCHEDULER_POLLING
+            erts_io_clear_nif_select(state->fd, state);
+#endif
             ASSERT(!state->events && !state->active_events && !state->flags);
             check_fd_cleanup(state, &free_select, &free_nif);
 	    break;
diff --git a/erts/emulator/sys/common/erl_check_io.h b/erts/emulator/sys/common/erl_check_io.h
index d44b494223f3..96cd2732d417 100644
--- a/erts/emulator/sys/common/erl_check_io.h
+++ b/erts/emulator/sys/common/erl_check_io.h
@@ -58,6 +58,23 @@ Eterm erts_check_io_info(void *proc);
 void erts_io_notify_port_task_executed(ErtsPortTaskType type,
                                        ErtsPortTaskHandle *handle,
                                        void (*reset)(ErtsPortTaskHandle *));
+
+#if ERTS_POLL_USE_SCHEDULER_POLLING
+/**
+ * Handles an nif select non-message signal. Returns the message Eterm.
+ * 
+ * @param sig The signal that has arrived
+ */
+Eterm erts_io_handle_nif_select(ErtsMessage *sig);
+
+/**
+ * Clears all nif select handles from scheduler queue.
+ * 
+ * @param sig The signal that has arrived
+ */
+void erts_io_clear_nif_select_handles(ErtsSchedulerData *esdp);
+#endif
+
 /**
  * Returns the maximum number of fds that the check io framework can handle.
  */
diff --git a/erts/emulator/test/driver_SUITE.erl b/erts/emulator/test/driver_SUITE.erl
index 1a512e054490..06206d0a2588 100644
--- a/erts/emulator/test/driver_SUITE.erl
+++ b/erts/emulator/test/driver_SUITE.erl
@@ -89,6 +89,7 @@
 -export([bin_prefix/2]).
 
 -export([get_check_io_total/1]).   % for z_SUITE.erl
+-export([check_io_debug/0]). %% For nif_SUITE.erl
 
 -include_lib("common_test/include/ct.hrl").
 
diff --git a/erts/emulator/test/nif_SUITE.erl b/erts/emulator/test/nif_SUITE.erl
index ea5ffbff25d0..9701ee0b614c 100644
--- a/erts/emulator/test/nif_SUITE.erl
+++ b/erts/emulator/test/nif_SUITE.erl
@@ -37,8 +37,8 @@
          t_load_race/1,
          t_call_nif_early/1,
          load_traced_nif/1,
-         select/1, select_steal/1,
-	 select_error/1,
+         select/1, select_scheduler/1,
+         select_steal/1, select_error/1,
          monitor_process_a/1,
          monitor_process_b/1,
          monitor_process_c/1,
@@ -154,6 +154,7 @@
        select_nif/6,
        dupe_resource_nif/1,
        pipe_nif/0,
+       socketpair_nif/0,
        write_nif/2,
        read_nif/2,
        close_nif/1,
@@ -268,6 +269,7 @@ groups() ->
                     monitor_process_purge,
                     demonitor_process]},
      {select, [], [select,
+                   select_scheduler,
 		   select_error,
 		   select_steal]}].
 
@@ -298,7 +300,8 @@ init_per_testcase(nif_whereis_threaded, Config) ->
         false -> {skip, "No thread support"}
     end;
 init_per_testcase(Select, Config) when Select =:= select;
-                                       Select =:= select_steal ->
+                                       Select =:= select_steal;
+                                       Select =:= select_scheduler ->
     case os:type() of
         {win32,_} ->
             {skip, "Test not yet implemented for windows"};
@@ -318,6 +321,7 @@ end_per_testcase(_Func, _Config) ->
     testcase_cleanup().
 
 testcase_cleanup() ->
+    driver_SUITE:check_io_debug(),
     P1 = code:purge(nif_mod),
     Del = code:delete(nif_mod),
     P2 = code:purge(nif_mod),
@@ -1003,6 +1007,7 @@ select_2(Flag, Ref1, Ref2, MSG_ENV) ->
 select_3() ->
     erlang:garbage_collect(),
     {_,_,2} = last_resource_dtor_call(),
+
     ok.
 
 receive_ready(R, Ref, IOatom) when is_reference(Ref) ->
@@ -1011,6 +1016,177 @@ receive_ready(_, Msg, _) ->
     [Got] = flush(),
     {true,_,_} = {Got=:=Msg, Got, Msg}.
 
+select_scheduler(Config) ->
+    ensure_lib_loaded(Config),
+
+    RefBin = list_to_binary(lists:duplicate(100, $x)),
+
+    select_scheduler_do(0, make_ref(), null),
+    select_scheduler_do(?ERL_NIF_SELECT_CUSTOM_MSG, [a, "list", RefBin], null),
+    select_scheduler_do(?ERL_NIF_SELECT_CUSTOM_MSG, [a, "list", RefBin], alloc_env),
+
+    case has_scheduler_pollset() of
+        true ->
+            {ok, Peer, Node} = ?CT_PEER(#{ args => ["+IOs","false"]}),
+
+            erpc:call(Node, fun() ->
+                                    ensure_lib_loaded(Config),
+                                    select_scheduler_do(0, make_ref(), null),
+                                    select_scheduler_do(?ERL_NIF_SELECT_CUSTOM_MSG, [a, "list", RefBin], null),
+                                    select_scheduler_do(?ERL_NIF_SELECT_CUSTOM_MSG, [a, "list", RefBin], alloc_env)
+                            end),
+
+            peer:stop(Peer);
+        _ ->
+            ok
+    end.
+
+%% This testcase tests so that scheduler polling works as it should for NIFs
+select_scheduler_do(Flag, Ref, MSG_ENV) ->
+
+    OriginalSchedPollFds = get_scheduler_pollset_size(),
+    SchedulerFDs = case has_scheduler_pollset() of
+            true -> 1;
+            false -> 0
+    end,
+
+    {{R, _R_ptr}, {W, W_ptr}} = socketpair_nif(),
+    ok = write_nif(W, <<"hej">>),
+    <<"hej">> = read_nif(R, 3),
+
+    %% Fill the output buffers and setup a select
+    FullData = write_full(R, $a),
+    select_nif(R, ?ERL_NIF_SELECT_WRITE bor Flag, R, self(), Ref, MSG_ENV),
+
+    eagain = read_nif(R, 3),
+
+    %% Move FD to scheduler pollset
+    move_fd_to_scheduler_pollset(W, R, Flag, Ref, MSG_ENV),
+    ?assertEqual(OriginalSchedPollFds + SchedulerFDs, get_scheduler_pollset_size()),
+    
+    %% Write without select, means migrate back to poll thread
+    ok = write_nif(W, <<"hej">>),
+    %% Make sure schedulers sleeps, triggering migration back to poll thread
+    timer:sleep(10),
+    <<"hej">> = read_all_nif(R, 3),
+
+    ?assertEqual(OriginalSchedPollFds, get_scheduler_pollset_size()),
+
+    %% Move FD to scheduler pollset again
+    move_fd_to_scheduler_pollset(W, R, Flag, Ref, MSG_ENV),
+    ?assertEqual(OriginalSchedPollFds + SchedulerFDs, get_scheduler_pollset_size()),
+
+    %% Check that we get WRITE select event if read FullData
+    FullData = read_all_nif(W, byte_size(FullData)),
+    receive_ready(R, Ref, ready_output),
+    ?assertEqual(OriginalSchedPollFds + SchedulerFDs, get_scheduler_pollset_size()),
+
+    %% Check that we can do a WRITE select when READ is on scheduler pollset
+    FullDataAgain = write_full(R, $b),
+    select_nif(R, ?ERL_NIF_SELECT_WRITE bor Flag, R, self(), Ref, MSG_ENV),
+    FullDataAgain = read_nif(W, byte_size(FullDataAgain)),
+    receive_ready(R, Ref, ready_output),
+    ?assertEqual(OriginalSchedPollFds + SchedulerFDs, get_scheduler_pollset_size()),
+
+    %% Check that we can de-select on READ when in scheduler pollset
+    0 = select_nif(R, ?ERL_NIF_SELECT_READ bor Flag, R, self(), Ref, MSG_ENV),
+    ?assertEqual(OriginalSchedPollFds + SchedulerFDs, get_scheduler_pollset_size()),
+    ?ERL_NIF_SELECT_READ_CANCELLED =
+        select_nif(R, ?ERL_NIF_SELECT_READ bor ?ERL_NIF_SELECT_CANCEL, R, self(), Ref, MSG_ENV),
+    ?assertEqual(OriginalSchedPollFds + SchedulerFDs, get_scheduler_pollset_size()),
+    ok = write_nif(W, <<"hej">>),
+    <<"hej">> = read_all_nif(R, 3),
+    [] = flush(0),
+
+    %% Close write side, while read side is in scheduler pollset
+    check_stop_ret(select_nif(W, ?ERL_NIF_SELECT_STOP, W, null, Ref, null)),
+    [{fd_resource_stop, W_ptr, _}] = flush(),
+    {1, {W_ptr,_}} = last_fd_stop_call(),
+    true = is_closed_nif(W),
+    [] = flush(0),
+    0 = select_nif(R, ?ERL_NIF_SELECT_READ bor Flag, R, self(), Ref, MSG_ENV),
+    receive_ready(R, Ref, ready_input),
+    eof = read_nif(R,1),
+
+    check_stop_ret(select_nif(R, ?ERL_NIF_SELECT_STOP, R, null, Ref, null)),
+    [{fd_resource_stop, R_ptr, _}] = flush(),
+    {1, {R_ptr,_}} = last_fd_stop_call(),
+    true = is_closed_nif(R),
+    [] = flush(0),
+
+    ?assertEqual(OriginalSchedPollFds, get_scheduler_pollset_size()),
+
+    %% We setup 10 fds in parallel to make sure all end up in scheduler pollset
+    NumberOfFds = 10,
+    Parent = self(),
+    Pids = [spawn_monitor(fun() ->
+                link(Parent),
+                {{R1, _R1_ptr}, {W1, _W1_ptr}} = socketpair_nif(),
+                move_fd_to_scheduler_pollset(W1, R1, Flag, Ref, MSG_ENV),
+                Parent ! self(),
+                receive stop -> ok end,
+                check_stop_ret(select_nif(W1, ?ERL_NIF_SELECT_STOP, W1, null, Ref, null)),
+                check_stop_ret(select_nif(R1, ?ERL_NIF_SELECT_STOP, R1, null, Ref, null))
+            end) || _ <- lists:seq(1,NumberOfFds)],
+    [receive P -> ok end || {P, _} <- Pids],
+    ?assertEqual(OriginalSchedPollFds + NumberOfFds*SchedulerFDs, get_scheduler_pollset_size()),
+    [begin P ! stop, receive {'DOWN', Ref1, _, _, _} -> ok end end || {P, Ref1} <- Pids],
+
+    NumberOfClosedFds = NumberOfFds * 2,
+    %% Sleep a bit to let all callback trigger
+    timer:sleep(1000),
+    {NumberOfClosedFds, {_,_}} = last_fd_stop_call(),
+
+    timer:sleep(1000),
+
+    %% Sleep a bit to let the pollset clear out
+    ?assertEqual(OriginalSchedPollFds, get_scheduler_pollset_size()),
+
+    ok.
+
+move_fd_to_scheduler_pollset(W, R, Flag, Ref, MSG_ENV) ->
+    [begin
+        0 = select_nif(R,?ERL_NIF_SELECT_READ bor Flag, R,null,Ref,MSG_ENV),
+        Buf = integer_to_binary(I),
+        ok = write_nif(W, Buf),
+        %% NOTE: If the testcase gets stuck here while running rr, that is
+        %% because the rr looses events for some reason and does not deliver
+        %% them as it is supposed to... so you will have to use good old fashioned
+        %% printf debugging...
+        receive
+            {select, R, Ref, ready_input} ->
+                ok;
+            Msg when Ref =:= Msg ->
+                ok
+        end,
+        Buf = read_all_nif(R, byte_size(Buf))
+     end || I <- lists:seq(1,30)],
+     ok.
+
+read_all_nif(Fd, Count) ->
+    case read_nif(Fd, Count) of
+        Res when byte_size(Res) =:= Count ->
+            Res;
+        Res when byte_size(Res) < Count ->
+            <<Res/binary, (read_all_nif(Fd, Count - byte_size(Res)))/binary>>
+    end.
+
+has_scheduler_pollset() ->
+    lists:search(fun(PS) ->
+        proplists:get_value(fallback, PS) =:= false andalso
+        proplists:get_value(poll_threads, PS) =:= 0
+    end, erlang:system_info(check_io)) =/= false.
+get_scheduler_pollset_size() ->
+    CIO = erlang:system_info(check_io),
+    case lists:search(fun(PS) ->
+            proplists:get_value(fallback, PS) =:= false andalso
+            proplists:get_value(poll_threads, PS) =:= 0
+        end, CIO) of
+        {value, PS} ->
+            proplists:get_value(total_poll_set_size, PS);
+        false ->
+            0
+    end.
 
 select_error(Config) when is_list(Config) ->
     ensure_lib_loaded(Config),
@@ -4431,6 +4607,7 @@ format_term_nif(_,_) -> ?nif_stub.
 select_nif(_,_,_,_,_,_) -> ?nif_stub.
 dupe_resource_nif(_) -> ?nif_stub.
 pipe_nif() -> ?nif_stub.
+socketpair_nif() -> ?nif_stub.
 write_nif(_,_) -> ?nif_stub.
 read_nif(_,_) -> ?nif_stub.
 close_nif(_) -> ?nif_stub.
diff --git a/erts/emulator/test/nif_SUITE_data/nif_SUITE.c b/erts/emulator/test/nif_SUITE_data/nif_SUITE.c
index 9e77ff94ee46..0ad9123aa8ef 100644
--- a/erts/emulator/test/nif_SUITE_data/nif_SUITE.c
+++ b/erts/emulator/test/nif_SUITE_data/nif_SUITE.c
@@ -28,6 +28,8 @@
 #include <unistd.h>
 #include <fcntl.h>
 #include <sys/uio.h>
+#include <sys/types.h>
+#include <sys/socket.h>
 #endif
 
 #include "nif_mod.h"
@@ -2708,6 +2710,42 @@ static ERL_NIF_TERM pipe_nif(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]
                enif_make_tuple2(env, write_fd, make_pointer(env, write_rsrc)));
 }
 
+/*
+ * Create a read-write socketpair with two fds (to read and to write)
+ */
+static ERL_NIF_TERM socketpair_nif(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
+{
+    struct fd_resource* read_rsrc;
+    struct fd_resource* write_rsrc;
+    ERL_NIF_TERM read_fd, write_fd;
+    int fds[2], flags;
+
+    if (socketpair(AF_UNIX, SOCK_STREAM, 0, fds) < 0)
+        return enif_make_string(env, "pipe failed", ERL_NIF_LATIN1);
+
+    flags = fcntl(fds[0], F_GETFL, 0);
+    if (flags == -1) return enif_make_badarg(env);
+    if (fcntl(fds[0], F_SETFL, flags | O_NONBLOCK) == -1) return enif_make_badarg(env);
+    flags = fcntl(fds[1], F_GETFL, 0);
+    if (flags == -1) return enif_make_badarg(env);
+    if (fcntl(fds[1], F_SETFL, flags | O_NONBLOCK) == -1) return enif_make_badarg(env);
+
+    read_rsrc  = enif_alloc_resource(fd_resource_type, sizeof(struct fd_resource));
+    write_rsrc = enif_alloc_resource(fd_resource_type, sizeof(struct fd_resource));
+    read_rsrc->fd  = fds[0];
+    read_rsrc->was_selected = 0;
+    write_rsrc->fd = fds[1];
+    write_rsrc->was_selected = 0;
+    read_fd  = enif_make_resource(env, read_rsrc);
+    write_fd = enif_make_resource(env, write_rsrc);
+    enif_release_resource(read_rsrc);
+    enif_release_resource(write_rsrc);
+
+    return enif_make_tuple2(env,
+               enif_make_tuple2(env, read_fd, make_pointer(env, read_rsrc)),
+               enif_make_tuple2(env, write_fd, make_pointer(env, write_rsrc)));
+}
+
 /*
  * Create (dupe) of a resource with the same fd, to test stealing
  */
@@ -2783,7 +2821,7 @@ static ERL_NIF_TERM read_nif(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]
             }
             return res;
         }
-        else if (n == 0) {
+        else if (n == 0 || errno == ECONNRESET) {
             return atom_eof;
         }
         else if (errno == EAGAIN) {
@@ -3891,6 +3929,7 @@ static ErlNifFunc nif_funcs[] =
     {"select_nif", 6, select_nif},
 #ifndef __WIN32__
     {"pipe_nif", 0, pipe_nif},
+    {"socketpair_nif", 0, socketpair_nif},
     {"write_nif", 2, write_nif},
     {"dupe_resource_nif", 1, dupe_resource_nif},
     {"read_nif", 2, read_nif},

From 674634c96924623857de2c99abfe51aa75b148fc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lukas=20Backstr=C3=B6m?= <lukas@erlang.org>
Date: Thu, 6 Feb 2025 15:06:03 +0100
Subject: [PATCH 5/5] erts: Expand debug printout in check io

---
 erts/emulator/sys/common/erl_check_io.c |  58 ++++++---
 erts/etc/unix/etp-commands.in           | 153 ++++++++++++++++++++++++
 2 files changed, 192 insertions(+), 19 deletions(-)

diff --git a/erts/emulator/sys/common/erl_check_io.c b/erts/emulator/sys/common/erl_check_io.c
index 48a34fc97b1e..5f11da97b8d0 100644
--- a/erts/emulator/sys/common/erl_check_io.c
+++ b/erts/emulator/sys/common/erl_check_io.c
@@ -117,6 +117,7 @@ static const char* event_state_flag_to_str(EventStateFlags f, const char *buff,
     case ERTS_EV_FLAG_USED: return "USED";
     case ERTS_EV_FLAG_FALLBACK: return "FLBK";
     case ERTS_EV_FLAG_FALLBACK | ERTS_EV_FLAG_USED: return "USED|FLBK";
+    case ERTS_EV_FLAG_WANT_ERROR: return "WANT_ERROR";
 
 #if ERTS_POLL_USE_SCHEDULER_POLLING
     case ERTS_EV_FLAG_CLEAR | ERTS_EV_FLAG_NIF_SELECT: return "CLEAR|NIF_SELECT";
@@ -223,6 +224,13 @@ int ERTS_WRITE_UNLIKELY(erts_no_pollsets) = 1;
 int ERTS_WRITE_UNLIKELY(erts_no_poll_threads) = 1;
 struct drv_ev_state_shared drv_ev_state;
 
+/* Used by etp */
+ErtsPollEvents etp_poll_ev_none = ERTS_POLL_EV_NONE;
+ErtsPollEvents etp_poll_ev_in = ERTS_POLL_EV_IN;
+ErtsPollEvents etp_poll_ev_out = ERTS_POLL_EV_OUT;
+ErtsPollEvents etp_poll_ev_err = ERTS_POLL_EV_ERR;
+ErtsPollEvents etp_poll_ev_nval = ERTS_POLL_EV_NVAL;
+
 static ERTS_INLINE int fd_hash(ErtsSysFdType fd)
 {
 #ifdef ERTS_SYS_CONTINOUS_FD_NUMBERS
@@ -369,7 +377,7 @@ static ERTS_INLINE void iready(Eterm id, ErtsDrvEventState *state);
 static ERTS_INLINE void oready(Eterm id, ErtsDrvEventState *state);
 #ifdef DEBUG_PRINT_MODE
 static char *drvmode2str(int mode);
-static char *nifmode2str(enum ErlNifSelectFlags mode);
+static char *nifmode2str(enum ErlNifSelectFlags mode, const char *buff, int sz);
 #endif
 
 static ERTS_INLINE void
@@ -1237,6 +1245,9 @@ enif_select_x(ErlNifEnv* env,
     ErtsDrvSelectDataState *free_select = NULL;
     ErtsNifSelectDataState *free_nif = NULL;
     ErtsPollEvents new_events = 0;
+#ifdef DEBUG_PRINT_MODE
+    char tmp_buff[255];
+#endif
 
     ASSERT(!erts_dbg_is_resource_dying(resource));
 
@@ -1252,7 +1263,7 @@ enif_select_x(ErlNifEnv* env,
     state = get_drv_ev_state(fd); /* may be NULL! */
 
     DEBUG_PRINT_FD("enif_select(%T, %d, %s, %p, %T, %T)",
-                   state, env->proc->common.id, fd, nifmode2str(mode), resource,
+                   state, env->proc->common.id, fd, nifmode2str(mode, tmp_buff, sizeof(tmp_buff)), resource,
                    pid ? pid->pid : THE_NON_VALUE, THE_NON_VALUE);
 
     if (mode & ERL_NIF_SELECT_STOP) {
@@ -2826,24 +2837,33 @@ drvmode2str(int mode) {
 }
 
 static ERTS_INLINE char *
-nifmode2str(enum ErlNifSelectFlags mode) {
-    if (mode & ERL_NIF_SELECT_STOP)
-        return "STOP";
-    switch (mode) {
-    case ERL_NIF_SELECT_READ: return "READ";
-    case ERL_NIF_SELECT_WRITE: return "WRITE";
-    case ERL_NIF_SELECT_READ|ERL_NIF_SELECT_WRITE: return "READ|WRITE";
-    case ERL_NIF_SELECT_CANCEL|ERL_NIF_SELECT_READ: return "CANCEL|READ";
-    case ERL_NIF_SELECT_CANCEL|ERL_NIF_SELECT_WRITE: return "CANCEL|WRITE";
-    case ERL_NIF_SELECT_CANCEL|ERL_NIF_SELECT_READ|ERL_NIF_SELECT_WRITE:
-        return "CANCEL|READ|WRITE";
-    case ERL_NIF_SELECT_CUSTOM_MSG|ERL_NIF_SELECT_READ: return "CUSTOM|READ";
-    case ERL_NIF_SELECT_CUSTOM_MSG|ERL_NIF_SELECT_WRITE: return "CUSTOM|WRITE";
-    case ERL_NIF_SELECT_CUSTOM_MSG|ERL_NIF_SELECT_READ|ERL_NIF_SELECT_WRITE:
-        return "CUSTOM|READ|WRITE";
-    default: return "UNKNOWN";
+nifmode2str(enum ErlNifSelectFlags mode, const char *orig, int len) {
+    char *prefix = "";
+    int pos = 0;
+    char *buff = (char*)orig;
+
+    #define NIFMODE_PRINT(MODE, NAME) do {                            \
+        if (mode & MODE) {                                            \
+          pos += snprintf(buff + pos, len-pos ,"%s%s", prefix, NAME); \
+          prefix = "|";                                               \
+          mode &= ~MODE;                                              \
+        }                                                             \
+    } while(0)
+
+    NIFMODE_PRINT(ERL_NIF_SELECT_STOP, "STOP");
+    NIFMODE_PRINT(ERL_NIF_SELECT_WRITE, "WRITE");
+    NIFMODE_PRINT(ERL_NIF_SELECT_READ, "READ");
+    NIFMODE_PRINT(ERL_NIF_SELECT_CUSTOM_MSG, "CUSTOM");
+    NIFMODE_PRINT(ERL_NIF_SELECT_CANCEL, "CANCEL");
+    NIFMODE_PRINT(ERL_NIF_SELECT_ERROR, "ERROR");
+
+#undef NIFMODE_PRINT
+
+    if (mode) {
+        snprintf(buff+pos, len-pos, "%sERROR(%d)", prefix, mode);
     }
-}
+    return buff;
+ }
 
 #endif
 
diff --git a/erts/etc/unix/etp-commands.in b/erts/etc/unix/etp-commands.in
index 559b45262cd5..552cdbda1f15 100644
--- a/erts/etc/unix/etp-commands.in
+++ b/erts/etc/unix/etp-commands.in
@@ -62,6 +62,7 @@ document etp-help
 %   etp-heapdump, etp-offheapdump, etpf-offheapdump,
 %   etp-search-heaps, etp-search-alloc,
 %   etp-ets-tables, etp-ets-tabledump
+%   etp-fd-dump
 %
 % Complex commands that use the Erlang support module.
 %   etp-overlapped-heaps, etp-chart, etp-chart-start, etp-chart-end
@@ -4901,6 +4902,158 @@ document etp-ets-tabledump
 %---------------------------------------------------------------------------
 end
 
+define _etp-fd-print-events
+    if $arg0 == 0 || $arg0 == etp_poll_ev_none
+        printf "NONE"
+    else
+    if $arg0 == etp_poll_ev_in
+        printf "IN"
+    else
+    if $arg0 == etp_poll_ev_out
+        printf "OUT"
+    else
+    if $arg0 == (etp_poll_ev_in|etp_poll_ev_out)
+        printf "IN|OUT"
+    else
+    if $arg0 == etp_poll_ev_err
+        printf "ERR"
+    else
+    if $arg0 == etp_poll_ev_nval
+        printf "NVAL"
+    else
+        printf "UNKNOWN(%d)", $arg0
+    end
+    end
+    end
+    end
+    end
+    end
+    printf "\n"
+end
+
+define etp-fd
+    set $etp_fd_state = &drv_ev_state.v[$arg0]
+    printf "fd: %d\n", $arg0
+    
+    printf "  type: "
+    if $etp_fd_state->type == 0
+        printf "NONE\n"
+    end
+    if $etp_fd_state->type == 1
+        printf "DRV_SEL\n"
+    end
+    if $etp_fd_state->type == 2
+        printf "STOP_USE\n"
+    end
+    if $etp_fd_state->type == 3
+        printf "NIF\n"
+    end
+    if $etp_fd_state->type == 4
+        printf "STOP_NIF\n"
+    end
+
+    printf "  events: "
+    _etp-fd-print-events $etp_fd_state->events
+    
+    printf "  active_events: "
+    _etp-fd-print-events $etp_fd_state->active_events
+    
+    printf "  flags: "
+    set $etp_fd_flags = $etp_fd_state->flags
+    if  $etp_fd_flags == 0x0
+        printf "NONE"
+    else
+        if  $etp_fd_flags & 0x1
+            printf "USED "
+            set $etp_fd_flags =  $etp_fd_flags & ~0x1
+        end
+        if  $etp_fd_flags & 0x2
+            printf "SCHEDULER "
+            set $etp_fd_flags =  $etp_fd_flags & ~0x2
+        end
+        if  $etp_fd_flags & 0x4
+            printf "IN_SCHEDULER "
+            set $etp_fd_flags =  $etp_fd_flags & ~0x4
+        end
+        if  $etp_fd_flags & 0x8
+            printf "NIF_SELECT "
+            set $etp_fd_flags =  $etp_fd_flags & ~0x8
+        end
+        if  $etp_fd_flags & 0x10
+            printf "FALLBACK "
+            set $etp_fd_flags =  $etp_fd_flags & ~0x10
+        end
+        if  $etp_fd_flags & 0x20
+            printf "WANT_ERROR "
+            set $etp_fd_flags =  $etp_fd_flags & ~0x20
+        end
+        if $etp_fd_flags
+            printf "ERROR(%d)", $etp_fd_flags
+        end
+    end
+    printf "\n"
+    if $etp_fd_state->driver.select
+        if $etp_fd_state->driver.select->inport != $etp_nil
+            printf "  IN PORT: "
+            etp $etp_fd_state->driver.select->inport
+        end
+        if $etp_fd_state->driver.select->outport != $etp_nil
+            printf "  OUT PORT: "
+            etp $etp_fd_state->driver.select->outport
+        end
+        if $etp_fd_state->driver.select->outiotask.task.counter != 0
+            printf "  OUT TASK: (ErtsPortTask*)%p\n", $etp_fd_state->driver.select->outiotask.task.counter
+        end
+        if $etp_fd_state->driver.select->iniotask.task.counter != 0
+            printf "  IN TASK: (ErtsPortTask*)%p\n", $etp_fd_state->driver.select->iniotask.task.counter
+        end
+    end
+    if $etp_fd_state->driver.nif
+        if $etp_fd_state->driver.nif->in.pid != $etp_nil
+            printf "  IN PID: "
+            etp $etp_fd_state->driver.nif->in.pid
+        end
+        if $etp_fd_state->driver.nif->out.pid != $etp_nil
+            printf "  OUT PID: "
+            etp $etp_fd_state->driver.nif->out.pid
+        end
+    end
+end
+
+document etp-fd
+%---------------------------------------------------------------------------
+% etp-fd FD
+%
+% Dump info about a specific FD
+%---------------------------------------------------------------------------
+end
+
+define etp-fd-dump
+    set $etp_fd_dump_i = 0
+    set $etp_fd_dump_len = drv_ev_state.len.counter
+    while $etp_fd_dump_i < $etp_fd_dump_len
+        set $etp_fd_dump_state = &drv_ev_state.v[$etp_fd_dump_i]
+        set $should_display = $etp_fd_dump_state->flags != 0
+        set $should_display |= $etp_fd_dump_state->events != etp_poll_ev_none && $etp_fd_dump_state->events != 0
+        set $should_display |= $etp_fd_dump_state->active_events != etp_poll_ev_none && $etp_fd_dump_state->active_events != 0
+        set $should_display |= $etp_fd_dump_state->driver.select != 0
+        set $should_display |= $etp_fd_dump_state->driver.nif != 0
+        if  $should_display
+            etp-fd $etp_fd_dump_i
+        end
+        set $etp_fd_dump_i++
+    end
+end
+
+document etp-fd-dump
+%---------------------------------------------------------------------------
+% etp-fd-dump
+%
+% Dump all FDs in the system
+%---------------------------------------------------------------------------
+end
+
+
 define etp-lc-dump
 # Non-reentrant
   set $etp_lc_dump_thread = erts_locked_locks