Netdev Archive on lore.kernel.org
* [PATCH v3 1/2] net: add support for threaded NAPI polling
From: Felix Fietkau @ 2020-08-21 19:01 UTC
To: netdev; +Cc: Eric Dumazet, Hillf Danton
For some drivers (especially 802.11 drivers), doing a lot of work in the NAPI
poll function does not perform well. Since NAPI poll is bound to the CPU it
was scheduled from, we can easily end up with a few very busy CPUs spending
most of their time in softirq/ksoftirqd and some idle ones.
Introduce threaded NAPI for such drivers based on a workqueue. The API is the
same except for using netif_threaded_napi_add instead of netif_napi_add.
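For illustration, a driver-side sketch (hypothetical mydrv names, not taken
from mt76; only the registration call differs from a regular NAPI setup):

#include <linux/netdevice.h>

struct mydrv_priv {
	struct napi_struct napi;
	/* ... device state ... */
};

static int mydrv_poll(struct napi_struct *napi, int budget)
{
	int done = 0;

	/* ... receive up to @budget frames, incrementing done ... */

	if (done < budget)
		napi_complete_done(napi, done);
	return done;
}

static void mydrv_napi_setup(struct mydrv_priv *priv, struct net_device *dev)
{
	/* Same signature as netif_napi_add(), but polling will run from
	 * the high-priority napi_workq instead of softirq context.
	 */
	netif_threaded_napi_add(dev, &priv->napi, mydrv_poll, NAPI_POLL_WEIGHT);
	napi_enable(&priv->napi);
}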
In my tests with mt76 on MT7621, using threaded NAPI + a thread for tx scheduling
improves LAN->WLAN bridging throughput by 10-50%. Throughput without threaded
NAPI is wildly inconsistent, depending on the CPU that runs the tx scheduling
thread.
With threaded NAPI, throughput seems stable and consistent (and higher than
the best results I got without it).
Based on a patch by Hillf Danton
Cc: Hillf Danton <hdanton@sina.com>
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
Changes since PATCH v2:
- Split sysfs attribute into a separate patch
- Take RTNL on attribute show
- make napi_threaded attribute static
Changes since PATCH v1:
- use WQ_SYSFS to make workqueue configurable from user space
- cancel work in netif_napi_del
- add a sysfs file to enable/disable threaded NAPI for a netdev
Changes since RFC v2:
- fix unused but set variable reported by kbuild test robot
Changes since RFC:
- disable softirq around threaded poll functions
- reuse most parts of napi_poll()
- fix re-schedule condition
include/linux/netdevice.h | 23 +++++++
net/core/dev.c | 139 +++++++++++++++++++++++++++-----------
2 files changed, 124 insertions(+), 38 deletions(-)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index b0e303f6603f..69507e6d4dc8 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -347,6 +347,7 @@ struct napi_struct {
struct list_head dev_list;
struct hlist_node napi_hash_node;
unsigned int napi_id;
+ struct work_struct work;
};
enum {
@@ -357,6 +358,7 @@ enum {
NAPI_STATE_HASHED, /* In NAPI hash (busy polling possible) */
NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */
NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */
+ NAPI_STATE_THREADED, /* Use threaded NAPI */
};
enum {
@@ -367,6 +369,7 @@ enum {
NAPIF_STATE_HASHED = BIT(NAPI_STATE_HASHED),
NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL),
NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL),
+ NAPIF_STATE_THREADED = BIT(NAPI_STATE_THREADED),
};
enum gro_result {
@@ -2327,6 +2330,26 @@ static inline void *netdev_priv(const struct net_device *dev)
void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
int (*poll)(struct napi_struct *, int), int weight);
+/**
+ * netif_threaded_napi_add - initialize a NAPI context
+ * @dev: network device
+ * @napi: NAPI context
+ * @poll: polling function
+ * @weight: default weight
+ *
+ * This variant of netif_napi_add() should be used from drivers using NAPI
+ * with CPU intensive poll functions.
+ * This will schedule polling from a high priority workqueue
+ */
+static inline void netif_threaded_napi_add(struct net_device *dev,
+ struct napi_struct *napi,
+ int (*poll)(struct napi_struct *, int),
+ int weight)
+{
+ set_bit(NAPI_STATE_THREADED, &napi->state);
+ netif_napi_add(dev, napi, poll, weight);
+}
+
/**
* netif_tx_napi_add - initialize a NAPI context
* @dev: network device
diff --git a/net/core/dev.c b/net/core/dev.c
index b5d1129d8310..b6165309617c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -157,6 +157,7 @@ static DEFINE_SPINLOCK(offload_lock);
struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
struct list_head ptype_all __read_mostly; /* Taps */
static struct list_head offload_base __read_mostly;
+static struct workqueue_struct *napi_workq __read_mostly;
static int netif_rx_internal(struct sk_buff *skb);
static int call_netdevice_notifiers_info(unsigned long val,
@@ -6282,6 +6283,11 @@ void __napi_schedule(struct napi_struct *n)
{
unsigned long flags;
+ if (test_bit(NAPI_STATE_THREADED, &n->state)) {
+ queue_work(napi_workq, &n->work);
+ return;
+ }
+
local_irq_save(flags);
____napi_schedule(this_cpu_ptr(&softnet_data), n);
local_irq_restore(flags);
@@ -6329,6 +6335,11 @@ EXPORT_SYMBOL(napi_schedule_prep);
*/
void __napi_schedule_irqoff(struct napi_struct *n)
{
+ if (test_bit(NAPI_STATE_THREADED, &n->state)) {
+ queue_work(napi_workq, &n->work);
+ return;
+ }
+
____napi_schedule(this_cpu_ptr(&softnet_data), n);
}
EXPORT_SYMBOL(__napi_schedule_irqoff);
@@ -6597,6 +6608,86 @@ static void init_gro_hash(struct napi_struct *napi)
napi->gro_bitmask = 0;
}
+static int __napi_poll(struct napi_struct *n, bool *repoll)
+{
+ int work, weight;
+
+ weight = n->weight;
+
+ /* This NAPI_STATE_SCHED test is for avoiding a race
+ * with netpoll's poll_napi(). Only the entity which
+ * obtains the lock and sees NAPI_STATE_SCHED set will
+ * actually make the ->poll() call. Therefore we avoid
+ * accidentally calling ->poll() when NAPI is not scheduled.
+ */
+ work = 0;
+ if (test_bit(NAPI_STATE_SCHED, &n->state)) {
+ work = n->poll(n, weight);
+ trace_napi_poll(n, work, weight);
+ }
+
+ if (unlikely(work > weight))
+ pr_err_once("NAPI poll function %pS returned %d, exceeding its budget of %d.\n",
+ n->poll, work, weight);
+
+ if (likely(work < weight))
+ return work;
+
+ /* Drivers must not modify the NAPI state if they
+ * consume the entire weight. In such cases this code
+ * still "owns" the NAPI instance and therefore can
+ * move the instance around on the list at-will.
+ */
+ if (unlikely(napi_disable_pending(n))) {
+ napi_complete(n);
+ return work;
+ }
+
+ if (n->gro_bitmask) {
+ /* flush too old packets
+ * If HZ < 1000, flush all packets.
+ */
+ napi_gro_flush(n, HZ >= 1000);
+ }
+
+ gro_normal_list(n);
+
+ *repoll = true;
+
+ return work;
+}
+
+static void napi_workfn(struct work_struct *work)
+{
+ struct napi_struct *n = container_of(work, struct napi_struct, work);
+ void *have;
+
+ for (;;) {
+ bool repoll = false;
+
+ local_bh_disable();
+
+ have = netpoll_poll_lock(n);
+ __napi_poll(n, &repoll);
+ netpoll_poll_unlock(have);
+
+ local_bh_enable();
+
+ if (!repoll)
+ return;
+
+ if (!need_resched())
+ continue;
+
+ /*
+ * have to pay for the latency of task switch even if
+ * napi is scheduled
+ */
+ queue_work(napi_workq, work);
+ return;
+ }
+}
+
void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
int (*poll)(struct napi_struct *, int), int weight)
{
@@ -6617,6 +6708,7 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
#ifdef CONFIG_NETPOLL
napi->poll_owner = -1;
#endif
+ INIT_WORK(&napi->work, napi_workfn);
set_bit(NAPI_STATE_SCHED, &napi->state);
napi_hash_add(napi);
}
@@ -6655,6 +6747,7 @@ static void flush_gro_hash(struct napi_struct *napi)
void netif_napi_del(struct napi_struct *napi)
{
might_sleep();
+ cancel_work_sync(&napi->work);
if (napi_hash_del(napi))
synchronize_net();
list_del_init(&napi->dev_list);
@@ -6667,53 +6760,19 @@ EXPORT_SYMBOL(netif_napi_del);
static int napi_poll(struct napi_struct *n, struct list_head *repoll)
{
+ bool do_repoll = false;
void *have;
- int work, weight;
+ int work;
list_del_init(&n->poll_list);
have = netpoll_poll_lock(n);
- weight = n->weight;
+ work = __napi_poll(n, &do_repoll);
- /* This NAPI_STATE_SCHED test is for avoiding a race
- * with netpoll's poll_napi(). Only the entity which
- * obtains the lock and sees NAPI_STATE_SCHED set will
- * actually make the ->poll() call. Therefore we avoid
- * accidentally calling ->poll() when NAPI is not scheduled.
- */
- work = 0;
- if (test_bit(NAPI_STATE_SCHED, &n->state)) {
- work = n->poll(n, weight);
- trace_napi_poll(n, work, weight);
- }
-
- if (unlikely(work > weight))
- pr_err_once("NAPI poll function %pS returned %d, exceeding its budget of %d.\n",
- n->poll, work, weight);
-
- if (likely(work < weight))
+ if (!do_repoll)
goto out_unlock;
- /* Drivers must not modify the NAPI state if they
- * consume the entire weight. In such cases this code
- * still "owns" the NAPI instance and therefore can
- * move the instance around on the list at-will.
- */
- if (unlikely(napi_disable_pending(n))) {
- napi_complete(n);
- goto out_unlock;
- }
-
- if (n->gro_bitmask) {
- /* flush too old packets
- * If HZ < 1000, flush all packets.
- */
- napi_gro_flush(n, HZ >= 1000);
- }
-
- gro_normal_list(n);
-
/* Some drivers may have called napi_schedule
* prior to exhausting their budget.
*/
@@ -10975,6 +11034,10 @@ static int __init net_dev_init(void)
sd->backlog.weight = weight_p;
}
+ napi_workq = alloc_workqueue("napi_workq", WQ_UNBOUND | WQ_HIGHPRI,
+ WQ_UNBOUND_MAX_ACTIVE | WQ_SYSFS);
+ BUG_ON(!napi_workq);
+
dev_boot_phase = 0;
/* The loopback device is special if any other network devices
--
2.28.0
* [PATCH v3 2/2] net: add sysfs attribute for enabling threaded NAPI
From: Felix Fietkau @ 2020-08-21 19:01 UTC
To: netdev; +Cc: Eric Dumazet, Hillf Danton
This can be used to enable threaded NAPI on drivers that did not explicitly
request it.
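For reference, toggling it from user space is just a write to the new
attribute, e.g. echo 1 > /sys/class/net/<iface>/napi_threaded. A minimal C
sketch doing the same (the interface name eth0 is an assumption):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/sys/class/net/eth0/napi_threaded";
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror(path);
		return 1;
	}
	/* Write "1" to enable threaded NAPI, "0" to disable it again. */
	if (write(fd, "1", 1) != 1) {
		perror("write");
		close(fd);
		return 1;
	}
	close(fd);
	return 0;
}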
Suggested-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
net/core/net-sysfs.c | 47 ++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 47 insertions(+)
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index efec66fa78b7..d7f7df715b60 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -472,6 +472,52 @@ static ssize_t proto_down_store(struct device *dev,
}
NETDEVICE_SHOW_RW(proto_down, fmt_dec);
+static int change_napi_threaded(struct net_device *dev, unsigned long val)
+{
+ struct napi_struct *napi;
+
+ if (list_empty(&dev->napi_list))
+ return -EOPNOTSUPP;
+
+ list_for_each_entry(napi, &dev->napi_list, dev_list) {
+ if (val)
+ set_bit(NAPI_STATE_THREADED, &napi->state);
+ else
+ clear_bit(NAPI_STATE_THREADED, &napi->state);
+ }
+
+ return 0;
+}
+
+static ssize_t napi_threaded_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return netdev_store(dev, attr, buf, len, change_napi_threaded);
+}
+
+static ssize_t napi_threaded_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct net_device *netdev = to_net_dev(dev);
+ struct napi_struct *napi;
+ bool enabled = false;
+
+ if (!rtnl_trylock())
+ return restart_syscall();
+
+ list_for_each_entry(napi, &netdev->napi_list, dev_list) {
+ if (test_bit(NAPI_STATE_THREADED, &napi->state))
+ enabled = true;
+ }
+
+ rtnl_unlock();
+
+ return sprintf(buf, fmt_dec, enabled);
+}
+static DEVICE_ATTR_RW(napi_threaded);
+
static ssize_t phys_port_id_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
@@ -564,6 +610,7 @@ static struct attribute *net_class_attrs[] __ro_after_init = {
&dev_attr_tx_queue_len.attr,
&dev_attr_gro_flush_timeout.attr,
&dev_attr_napi_defer_hard_irqs.attr,
+ &dev_attr_napi_threaded.attr,
&dev_attr_phys_port_id.attr,
&dev_attr_phys_port_name.attr,
&dev_attr_phys_switch_id.attr,
--
2.28.0
* Re: [PATCH v3 1/2] net: add support for threaded NAPI polling
From: Jakub Kicinski @ 2020-08-22 1:49 UTC
To: Felix Fietkau; +Cc: netdev, Eric Dumazet, Hillf Danton
On Fri, 21 Aug 2020 21:01:50 +0200 Felix Fietkau wrote:
> For some drivers (especially 802.11 drivers), doing a lot of work in the NAPI
> poll function does not perform well. Since NAPI poll is bound to the CPU it
> was scheduled from, we can easily end up with a few very busy CPUs spending
> most of their time in softirq/ksoftirqd and some idle ones.
>
> Introduce threaded NAPI for such drivers based on a workqueue. The API is the
> same except for using netif_threaded_napi_add instead of netif_napi_add.
>
> In my tests with mt76 on MT7621 using threaded NAPI + a thread for tx scheduling
> improves LAN->WLAN bridging throughput by 10-50%. Throughput without threaded
> NAPI is wildly inconsistent, depending on the CPU that runs the tx scheduling
> thread.
>
> With threaded NAPI, throughput seems stable and consistent (and higher than
> the best results I got without it).
>
> Based on a patch by Hillf Danton
I've tested this patch on a non-NUMA system with a moderately high
network workload (roughly 1:6 network-to-compute cycles), and it
provides a ~2.5% speedup in terms of RPS but 1/6/10% worse
P50/P99/P999 latency.
I started working on a counter-proposal which uses a pool of threads
dedicated to NAPI polling. It's not unlike the workqueue code but
trying to be a little more clever. It gives me ~6.5% more RPS but at
the same time lowers the p99 latency by 35% without impacting other
percentiles. (I only started testing this afternoon, so hopefully the
numbers will improve further).
I'm happy for this patch to be merged, it's quite nice, but I wanted
to give the heads up that I may have something that would replace it...
The extremely rough PoC, less than half-implemented code which is really
too broken to share:
https://git.kernel.org/pub/scm/linux/kernel/git/kuba/linux.git/log/?h=tapi
* Re: [PATCH v3 1/2] net: add support for threaded NAPI polling
From: Eric Dumazet @ 2020-08-22 16:22 UTC
To: Jakub Kicinski, Felix Fietkau; +Cc: netdev, Eric Dumazet, Hillf Danton
On 8/21/20 6:49 PM, Jakub Kicinski wrote:
> On Fri, 21 Aug 2020 21:01:50 +0200 Felix Fietkau wrote:
>> [...]
>
> I've tested this patch on a non-NUMA system with a moderately
> high-network workload (roughly 1:6 network to compute cycles)
> - and it provides ~2.5% speedup in terms of RPS but 1/6/10% worse
> P50/P99/P999 latency.
>
> I started working on a counter-proposal which uses a pool of threads
> dedicated to NAPI polling. It's not unlike the workqueue code but
> trying to be a little more clever. It gives me ~6.5% more RPS but at
> the same time lowers the p99 latency by 35% without impacting other
> percentiles. (I only started testing this afternoon, so hopefully the
> numbers will improve further).
>
> I'm happy for this patch to be merged, it's quite nice, but I wanted
> to give the heads up that I may have something that would replace it...
>
> The extremely rough PoC, less than half-implemented code which is really
> too broken to share:
> https://git.kernel.org/pub/scm/linux/kernel/git/kuba/linux.git/log/?h=tapi
>
Yes, the idea of sharing a single napi_workq without the ability to perform
some per-queue tuning is probably okay for the class of devices Felix is
interested in.
I vote for waiting a bit to see what you can achieve, since Felix showed no
intent to work on using kthreads instead of workqueues.
Having one kthread per queue gives us existing instrumentation (sched stats)
and the ability to choose optimal affinities and priorities.
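As a rough illustration of that direction (a sketch only, not code from this
patch nor from Jakub's PoC), a per-queue kthread could be created and pinned
along these lines:

#include <linux/kthread.h>
#include <linux/netdevice.h>
#include <linux/sched.h>

static int napi_kthread_fn(void *data)
{
	struct napi_struct *napi = data;

	while (!kthread_should_stop()) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (!test_bit(NAPI_STATE_SCHED, &napi->state)) {
			/* The driver's IRQ handler would wake_up_process()
			 * this task instead of calling napi_schedule().
			 */
			schedule();
			continue;
		}
		__set_current_state(TASK_RUNNING);

		local_bh_disable();
		/* Simplified: no budget accounting or completion handling. */
		napi->poll(napi, napi->weight);
		local_bh_enable();
	}
	return 0;
}

static struct task_struct *napi_kthread_start(struct napi_struct *napi, int cpu)
{
	struct task_struct *t;

	t = kthread_create(napi_kthread_fn, napi, "napi-%s/%d",
			   napi->dev->name, cpu);
	if (IS_ERR(t))
		return t;

	kthread_bind(t, cpu);	/* per-queue affinity chosen by the kernel */
	wake_up_process(t);
	return t;
}

Because each poller is then an ordinary task, its behaviour shows up in the
usual scheduler instrumentation (e.g. /proc/<pid>/sched, top), which is the
observability benefit mentioned above.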
Thanks !
* Re: [PATCH v3 1/2] net: add support for threaded NAPI polling
From: Sebastian Gottschall @ 2020-08-30 8:46 UTC
To: Jakub Kicinski, Felix Fietkau; +Cc: netdev, Eric Dumazet, Hillf Danton
On 22.08.2020 at 03:49, Jakub Kicinski wrote:
> On Fri, 21 Aug 2020 21:01:50 +0200 Felix Fietkau wrote:
>> [...]
> I've tested this patch on a non-NUMA system with a moderately
> high-network workload (roughly 1:6 network to compute cycles)
> - and it provides ~2.5% speedup in terms of RPS but 1/6/10% worse
> P50/P99/P999 latency.
>
> I started working on a counter-proposal which uses a pool of threads
> dedicated to NAPI polling. It's not unlike the workqueue code but
> trying to be a little more clever. It gives me ~6.5% more RPS but at
> the same time lowers the p99 latency by 35% without impacting other
> percentiles. (I only started testing this afternoon, so hopefully the
> numbers will improve further).
>
> I'm happy for this patch to be merged, it's quite nice, but I wanted
> to give the heads up that I may have something that would replace it...
>
> The extremely rough PoC, less than half-implemented code which is really
> too broken to share:
> https://git.kernel.org/pub/scm/linux/kernel/git/kuba/linux.git/log/?h=tapi
looks interesting. keep going
Sebastian
>
* Re: [PATCH v3 1/2] net: add support for threaded NAPI polling
From: Wei Wang @ 2020-08-25 16:38 UTC
To: Felix Fietkau
Cc: Linux Kernel Network Developers, Eric Dumazet, Hillf Danton, Wei Wang
On Fri, Aug 21, 2020 at 12:03 PM Felix Fietkau <nbd@nbd.name> wrote:
>
> [...]
>
> @@ -10975,6 +11034,10 @@ static int __init net_dev_init(void)
> sd->backlog.weight = weight_p;
> }
>
> + napi_workq = alloc_workqueue("napi_workq", WQ_UNBOUND | WQ_HIGHPRI,
> + WQ_UNBOUND_MAX_ACTIVE | WQ_SYSFS);
I believe WQ_SYSFS should be passed to the 2nd parameter here.
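That is, presumably the flags were meant to be combined in the second
argument, with max_active alone in the third; something like:

	napi_workq = alloc_workqueue("napi_workq",
				     WQ_UNBOUND | WQ_HIGHPRI | WQ_SYSFS,
				     WQ_UNBOUND_MAX_ACTIVE);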
> + BUG_ON(!napi_workq);
> +
> dev_boot_phase = 0;
>
> /* The loopback device is special if any other network devices
> --
> 2.28.0
>