net: track locally triggered link lossnetdev-carrier-down

A comment above carrier_up_count / carrier_down_count in netdevice.h proudly states: /* Stats to monitor link on/off, flapping */ In reality datacenter NIC drivers introduce quite a bit of noise into those statistics, making them less than ideal for link flap detection. There are 3 types of events counted as carrier changes today: (1) reconfiguration which requires pausing Tx and Rx but doesn't actually result in a link down for the remote end; (2) reconfiguration events which do take the link down; (3a) real PHY-detected link loss due to remote end's actions; (3b) real PHY-detected link loss due to signal integrity issues. (3a and 3b are indistinguishable to local end so counting as one.) Reconfigurations of type (1) are when drivers call netif_carrier_off() / netif_carrier_on() around changes to queues, IRQs, time stamping, XDP enablement etc. In DC scenarios machine provisioning or reallocation causes multiple settings to be changed in close succession. This looks like a spike in link flaps to monitoring systems. Suppressing the fake carrier changes while maintaining the Rx/Tx pause behavior seems hard, and can introduce a divergence in what the kernel thinks about the device (paused) vs what user space thinks (running). Another option would be to expose a link loss statistic which some devices (FW) already maintain. Unfortunately, such stats are not very common (unless my grepping skills fail me). Instead this patch tries to expose a new event count - number of locally caused link changes. Only "down" events are counted because the "up" events are not really in our control. The "real" link flops can be obtained by subtracting the new counter from carrier_down_count. In terms of API - drivers can use netif_carrier_admin_off() hen taking the link down. There's also an API for situations where driver requests link reset but expects the down / up reporting to come thru the usual, async path. It may be worth pointing out that in case of datacenter NICs even with the new statistic we will not be able to distinguish between events (1) and (2), and what follows two Linux boxes connected back-to-back won't be able to isolate events of type (3b). I think that to solve that we'd need yet another counter - carrier_down_local_link_was_really_reset... I think it's okay to leave that as a future extension. Signed-off-by: Jakub Kicinski <kuba@kernel.org>
author: Jakub Kicinski <kuba@kernel.org> 2022-05-18 15:03:19 -0700
committer: Jakub Kicinski <kuba@kernel.org> 2022-05-18 16:40:41 -0700
commit: 7bfffc83e277098fb9ff6849b34cd94007c708d6 (patch)
tree: 0df36ea5451f929c562b1c6bd7d4a0561ab42db9
parent: a3641ca416a3da7cbeae5bcf1fc26ba9797a1438 (diff)
download: linux-netdev-carrier-down.tar.gz
4 files changed, 52 insertions, 4 deletions
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index cbaf312e365ba..a874d6f7c1ebf 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -310,6 +310,7 @@ enum netdev_state_t {
 	__LINK_STATE_LINKWATCH_PENDING,
 	__LINK_STATE_DORMANT,
 	__LINK_STATE_TESTING,
+	__LINK_STATE_NOCARRIER_LOCAL,
 };
 
 struct gro_list {
@@ -1766,6 +1767,8 @@ enum netdev_ml_priv_type {
  *			do not use this in drivers
  *	@carrier_up_count:	Number of times the carrier has been up
  *	@carrier_down_count:	Number of times the carrier has been down
+ *	@carrier_down_local:	Number of times the carrier has been counted as
+ *				down due to local administrative actions
  *
  *	@wireless_handlers:	List of functions to handle Wireless Extensions,
  *				instead of ioctl,
@@ -2055,6 +2058,7 @@ struct net_device {
 	/* Stats to monitor link on/off, flapping */
 	atomic_t		carrier_up_count;
 	atomic_t		carrier_down_count;
+	unsigned int		carrier_down_local;
 
 #ifdef CONFIG_WIRELESS_EXT
 	const struct iw_handler_def *wireless_handlers;
@@ -4059,8 +4063,27 @@ unsigned long dev_trans_start(struct net_device *dev);
 
 void __netdev_watchdog_up(struct net_device *dev);
 
+/**
+ * netif_carrier_local_changes_start() - enter local link reconfiguration
+ * @dev: network device
+ *
+ * Mark link as unstable due to local administrative actions. This will
+ * cause netif_carrier_off() to behave like netif_carrier_admin_off() until
+ * netif_carrier_local_changes_end() is called.
+ */
+static inline void netif_carrier_local_changes_start(struct net_device *dev)
+{
+	set_bit(__LINK_STATE_NOCARRIER_LOCAL, &dev->state);
+}
+
+static inline void netif_carrier_local_changes_end(struct net_device *dev)
+{
+	clear_bit(__LINK_STATE_NOCARRIER_LOCAL, &dev->state);
+}
+
 void netif_carrier_on(struct net_device *dev);
 void netif_carrier_off(struct net_device *dev);
+void netif_carrier_admin_off(struct net_device *dev);
 void netif_carrier_event(struct net_device *dev);
 
 /**
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 5f58dcfe2787f..b53e17e6f2eaf 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -370,6 +370,7 @@ enum {
 	IFLA_GRO_MAX_SIZE,
 	IFLA_TSO_MAX_SIZE,
 	IFLA_TSO_MAX_SEGS,
+	IFLA_CARRIER_DOWN_LOCAL,
 
 	__IFLA_MAX
 };
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index ac45328607f77..2303769632668 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1092,6 +1092,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
 	       + nla_total_size(4)  /* IFLA_MAX_MTU */
 	       + rtnl_prop_list_size(dev)
 	       + nla_total_size(MAX_ADDR_LEN) /* IFLA_PERM_ADDRESS */
+	       + nla_total_size(4) /* IFLA_CARRIER_DOWN_LOCAL */
 	       + 0;
 }
 
@@ -1787,7 +1788,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
 	    nla_put_u32(skb, IFLA_CARRIER_UP_COUNT,
 			atomic_read(&dev->carrier_up_count)) ||
 	    nla_put_u32(skb, IFLA_CARRIER_DOWN_COUNT,
-			atomic_read(&dev->carrier_down_count)))
+			atomic_read(&dev->carrier_down_count)) ||
+	    nla_put_u32(skb, IFLA_CARRIER_DOWN_LOCAL, dev->carrier_down_local))
 		goto nla_put_failure;
 
 	if (rtnl_fill_proto_down(skb, dev))
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index dba0b3e24af5e..6519dce7817d5 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -589,23 +589,45 @@ void netif_carrier_on(struct net_device *dev)
 EXPORT_SYMBOL(netif_carrier_on);
 
 /**
- *	netif_carrier_off - clear carrier
- *	@dev: network device
+ * netif_carrier_off() - clear carrier in response to a true link state event
+ * @dev: network device
  *
- * Device has detected loss of carrier.
+ * Clear carrier and stop Tx, use when device has detected loss of carrier.
  */
 void netif_carrier_off(struct net_device *dev)
 {
+	bool admin = test_bit(__LINK_STATE_NOCARRIER_LOCAL, &dev->state);
+
 	if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
 		if (dev->reg_state == NETREG_UNINITIALIZED)
 			return;
 		atomic_inc(&dev->carrier_down_count);
+		if (admin)
+			WRITE_ONCE(dev->carrier_down_local,
+				   dev->carrier_down_local + 1);
 		linkwatch_fire_event(dev);
 	}
 }
 EXPORT_SYMBOL(netif_carrier_off);
 
 /**
+ * netif_carrier_admin_off() - clear carrier to reconfigure the device
+ * @dev: network device
+ *
+ * Force the carrier down, e.g. to perform device reconfiguration,
+ * reset the device after an error etc. This function should be used
+ * instead of netif_carrier_off() any time carrier goes down for reasons
+ * other that an actual link layer connection loss.
+ */
+void netif_carrier_admin_off(struct net_device *dev)
+{
+	netif_carrier_local_changes_start(dev);
+	netif_carrier_off(dev);
+	netif_carrier_local_changes_end(dev);
+}
+EXPORT_SYMBOL(netif_carrier_admin_off);
+
+/**
  *	netif_carrier_event - report carrier state event
  *	@dev: network device
  *
author	Jakub Kicinski <kuba@kernel.org>	2022-05-18 15:03:19 -0700
committer	Jakub Kicinski <kuba@kernel.org>	2022-05-18 16:40:41 -0700
commit	7bfffc83e277098fb9ff6849b34cd94007c708d6 (patch)
tree	0df36ea5451f929c562b1c6bd7d4a0561ab42db9
parent	a3641ca416a3da7cbeae5bcf1fc26ba9797a1438 (diff)
download	linux-netdev-carrier-down.tar.gz