about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author David Ahern <dsahern@kernel.org> 2024-01-22 03:44:13 +0000
committer David Ahern <dsahern@kernel.org> 2024-01-22 03:44:13 +0000
commit 4a6c579ae947cc08dcc12173a040b474c506923e (patch)
tree 38875bb9fa8df6a1e77bf9c3ba35d188368ab39a
parent 97d3edd74383628a01d7d6a1197196dd500d5357 (diff)
parent 3062aaf77027c69d1ab34c3483522f195c8856ad (diff)
download iproute2-4a6c579ae947cc08dcc12173a040b474c506923e.tar.gz
Merge branch 'main' into next
Signed-off-by: David Ahern <dsahern@kernel.org>
-rw-r--r-- doc/actions/actions-general 256
-rw-r--r-- doc/actions/gact-usage 78
-rw-r--r-- doc/actions/ifb-README 125
-rw-r--r-- doc/actions/mirred-usage 164
-rw-r--r-- genl/ctrl.c 2
-rw-r--r-- include/uapi/linux/pkt_cls.h 51
-rw-r--r-- include/uapi/linux/pkt_sched.h 109
-rw-r--r-- include/uapi/linux/tc_act/tc_ipt.h 20
-rw-r--r-- ip/iplink_xstats.c 2
-rw-r--r-- man/man8/tc-bfifo.8 2
-rw-r--r-- man/man8/tc-gact.8 85
-rw-r--r-- man/man8/tc-mirred.8 8
-rw-r--r-- man/man8/tc-pfifo_fast.8 4
-rw-r--r-- man/man8/tc.8 1
-rw-r--r-- misc/ss.c 2
-rw-r--r-- tc/m_gate.c 41
-rw-r--r-- tc/q_etf.c 43
-rw-r--r-- tc/q_taprio.c 43
-rw-r--r-- tc/tc_util.c 54
-rw-r--r-- tc/tc_util.h 4
20 files changed, 158 insertions, 936 deletions
diff --git a/doc/actions/actions-general b/doc/actions/actions-general
deleted file mode 100644
index a0074a58c..000000000
--- a/doc/actions/actions-general
+++ /dev/null
@@ -1,256 +0,0 @@
-
-This documented is slightly dated but should give you idea of how things
-work.
-
-What is it?
------------
-
-An extension to the filtering/classification architecture of Linux Traffic
-Control.
-Up to 2.6.8 the only action that could be "attached" to a filter was policing.
-i.e you could say something like:
-
------
-tc filter add dev lo parent ffff: protocol ip prio 10 u32 match ip src \
-127.0.0.1/32 flowid 1:1 police mtu 4000 rate 1500kbit burst 90k
------
-
-which implies "if a packet is seen on the ingress of the lo device with
-a source IP address of 127.0.0.1/32 we give it a classification id of 1:1 and
-we execute a policing action which rate limits its bandwidth utilization
-to 1.5Mbps".
-
-The new extensions allow for more than just policing actions to be added.
-They are also fully backward compatible. If you have a kernel that doesn't
-understand them, then the effect is null i.e if you have a newer tc
-but older kernel, the actions are not installed. Likewise if you
-have a newer kernel but older tc, obviously the tc will use current
-syntax which will work fine. Of course to get the required effect you need
-both newer tc and kernel. If you are reading this you have the
-right tc ;->
-
-A side effect is that we can now get stateless firewalling to work with tc.
-Essentially this is now an alternative to iptables.
-I won't go into details of my dislike for iptables at times, but
-scalability is one of the main issues; however, if you need stateful
-classification - use netfilter (for now).
-
-This stuff works on both ingress and egress qdiscs.
-
-Features
---------
-
-1) new additional syntax and actions enabled. Note old syntax is still valid.
-
-Essentially this is still the same syntax as tc with a new construct
-"action". The syntax is of the form:
-tc filter add <DEVICE> parent 1:0 protocol ip prio 10 <Filter description>
-flowid 1:1 action <ACTION description>*
-
-You can have as many actions as you want (within sensible reasoning).
-
-In the past the only real action was the policer; i.e you could do something
-along the lines of:
-tc filter add dev lo parent ffff: protocol ip prio 10 u32 \
-match ip src 127.0.0.1/32 flowid 1:1 \
-police mtu 4000 rate 1500kbit burst 90k
-
-Although you can still use the same syntax, now you can say:
-
-tc filter add dev lo parent 1:0 protocol ip prio 10 u32 \
-match ip src 127.0.0.1/32 flowid 1:1 \
-action police mtu 4000 rate 1500kbit burst 90k
-
-" generic Actions" (gact) at the moment are:
-{ drop, pass, reclassify, continue}
-(If you have others, no listed here give me a reason and we will add them)
-+drop says to drop the packet
-+pass and ok (are equivalent) says to accept it
-+reclassify requests for reclassification of the packet
-+continue requests for next lookup to match
-
-2)In order to take advantage of some of the targets written by the
-iptables people, a classifier can have a packet being massaged by an
-iptable target. I have only tested with mangler targets up to now.
-(in fact anything that is not in the mangling table is disabled right now)
-
-In terms of hooks:
-*ingress is mapped to pre-routing hook
-*egress is mapped to post-routing hook
-I don't see much value in the other hooks, if you see it and email me good
-reasons, the addition is trivial.
-
-Example syntax for iptables targets usage becomes:
-tc filter add ..... u32 <u32 syntax> action ipt -j <iptables target syntax>
-
-example:
-tc filter add dev lo parent ffff: protocol ip prio 8 u32 \
-match ip dst 127.0.0.8/32 flowid 1:12 \
-action ipt -j mark --set-mark 2
-
-NOTE: flowid 1:12 is parsed flowid 0x1:0x12. Make sure if you want flowid
-decimal 12, then use flowid 1:c.
-
-3) A feature i call pipe
-The motivation is derived from Unix pipe mechanism but applied to packets.
-Essentially take a matching packet and pass it through
-action1 | action2 | action3 etc.
-You could do something similar to this with the tc policer and the "continue"
-operator but this rather restricts it to just the policer and requires
-multiple rules (and lookups, hence quiet inefficient);
-
-as an example -- and please note that this is just an example _not_ The
-Word Youve Been Waiting For (yes i have had problems giving examples
-which ended becoming dogma in documents and people modifying them a little
-to look clever);
-
-i selected the metering rates to be small so that i can show better how
-things work.
-
-The script below does the following:
-- an incoming packet from 10.0.0.21 is first given a firewall mark of 1.
-
-- It is then metered to make sure it does not exceed its allocated rate of
-1Kbps. If it doesn't exceed rate, this is where we terminate action execution.
-
-- If it does exceed its rate, its "color" changes to a mark of 2 and it is
-then passed through a second meter.
-
--The second meter is shared across all flows on that device [i am surpised
-that this seems to be not a well know feature of the policer; Bert was telling
-me that someone was writing a qdisc just to do sharing across multiple devices;
-it must be the summer heat again; weve had someone doing that every year around
-summer -- the key to sharing is to use a operator "index" in your policer
-rules (example "index 20"). All your rules have to use the same index to
-share.]
-
--If the second meter is exceeded the color of the flow changes further to 3.
-
--We then pass the packet to another meter which is shared across all devices
-in the system. If this meter is exceeded we drop the packet.
-
-Note the mark can be used further up the system to do things like policy
-or more interesting things on the egress.
-
------------------- cut here -------------------------------
-#
-# Add an ingress qdisc on eth0
-tc qdisc add dev eth0 ingress
-#
-#if you see an incoming packet from 10.0.0.21
-tc filter add dev eth0 parent ffff: protocol ip prio 1 \
-u32 match ip src 10.0.0.21/32 flowid 1:15 \
-#
-# first give it a mark of 1
-action ipt -j mark --set-mark 1 index 2 \
-#
-# then pass it through a policer which allows 1kbps; if the flow
-# doesn't exceed that rate, this is where we stop, if it exceeds we
-# pipe the packet to the next action
-action police rate 1kbit burst 9k pipe \
-#
-# which marks the packet fwmark as 2 and pipes
-action ipt -j mark --set-mark 2 \
-#
-# next attempt to borrow b/width from a meter
-# used across all flows incoming on eth0("index 30")
-# and if that is exceeded we pipe to the next action
-action police index 30 mtu 5000 rate 1kbit burst 10k pipe \
-# mark it as fwmark 3 if exceeded
-action ipt -j mark --set-mark 3 \
-# and then attempt to borrow from a meter used by all devices in the
-# system. Should this be exceeded, drop the packet on the floor.
-action police index 20 mtu 5000 rate 1kbit burst 90k drop
----------------------------------
-
-Now lets see the actions installed with
-"tc filter show parent ffff: dev eth0"
-
--------- output -----------
-jroot# tc filter show parent ffff: dev eth0
-filter protocol ip pref 1 u32
-filter protocol ip pref 1 u32 fh 800: ht divisor 1
-filter protocol ip pref 1 u32 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:15
-
- action order 1: tablename: mangle hook: NF_IP_PRE_ROUTING
- target MARK set 0x1 index 2
-
- action order 2: police 1 action pipe rate 1Kbit burst 9Kb mtu 2Kb
-
- action order 3: tablename: mangle hook: NF_IP_PRE_ROUTING
- target MARK set 0x2 index 1
-
- action order 4: police 30 action pipe rate 1Kbit burst 10Kb mtu 5000b
-
- action order 5: tablename: mangle hook: NF_IP_PRE_ROUTING
- target MARK set 0x3 index 3
-
- action order 6: police 20 action drop rate 1Kbit burst 90Kb mtu 5000b
-
- match 0a000015/ffffffff at 12
--------------------------------
-
-Note the ordering of the actions is based on the order in which we entered
-them. In the future i will add explicit priorities.
-
-Now lets run a ping -f from 10.0.0.21 to this host; stop the ping after
-you see a few lines of dots
-
-----
-[root@jzny hadi]# ping -f 10.0.0.22
-PING 10.0.0.22 (10.0.0.22): 56 data bytes
-....................................................................................................................................................................................................................................................................................................................................................................................................................................................
---- 10.0.0.22 ping statistics ---
-2248 packets transmitted, 1811 packets received, 19% packet loss
-round-trip min/avg/max = 0.7/9.3/20.1 ms
------------------------------
-
-Now lets take a look at the stats with "tc -s filter show parent ffff: dev eth0"
-
---------------
-jroot# tc -s filter show parent ffff: dev eth0
-filter protocol ip pref 1 u32
-filter protocol ip pref 1 u32 fh 800: ht divisor 1
-filter protocol ip pref 1 u32 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:1
-5
-
- action order 1: tablename: mangle hook: NF_IP_PRE_ROUTING
- target MARK set 0x1 index 2
- Sent 188832 bytes 2248 pkts (dropped 0, overlimits 0)
-
- action order 2: police 1 action pipe rate 1Kbit burst 9Kb mtu 2Kb
- Sent 188832 bytes 2248 pkts (dropped 0, overlimits 2122)
-
- action order 3: tablename: mangle hook: NF_IP_PRE_ROUTING
- target MARK set 0x2 index 1
- Sent 178248 bytes 2122 pkts (dropped 0, overlimits 0)
-
- action order 4: police 30 action pipe rate 1Kbit burst 10Kb mtu 5000b
- Sent 178248 bytes 2122 pkts (dropped 0, overlimits 1945)
-
- action order 5: tablename: mangle hook: NF_IP_PRE_ROUTING
- target MARK set 0x3 index 3
- Sent 163380 bytes 1945 pkts (dropped 0, overlimits 0)
-
- action order 6: police 20 action drop rate 1Kbit burst 90Kb mtu 5000b
- Sent 163380 bytes 1945 pkts (dropped 0, overlimits 437)
-
- match 0a000015/ffffffff at 12
--------------------------------
-
-Neat, eh?
-
-
-Want to write an action module?
-------------------------------
-Its easy. Either look at the code or send me email. I will document at
-some point; will also accept documentation.
-
-TODO
-----
-
-Lotsa goodies/features coming. Requests also being accepted.
-At the moment the focus has been on getting the architecture in place.
-Expect new things in the spurious time i have to work on this
-(particularly around end of year when i have typically get time off
-from work).
diff --git a/doc/actions/gact-usage b/doc/actions/gact-usage
deleted file mode 100644
index 7cf48abbd..000000000
--- a/doc/actions/gact-usage
+++ /dev/null
@@ -1,78 +0,0 @@
-
-gact <ACTION> [RAND] [INDEX]
-
-Where:
- ACTION := reclassify | drop | continue | pass | ok
- RAND := random <RANDTYPE> <ACTION> <VAL>
- RANDTYPE := netrand | determ
- VAL : = value not exceeding 10000
- INDEX := index value used
-
-ACTION semantics
-- pass and ok are equivalent to accept
-- continue allows one to restart classification lookup
-- drop drops packets
-- reclassify implies continue classification where we left off
-
-randomization
---------------
-
-At the moment there are only two algorithms. One is deterministic
-and the other uses internal kernel netrand.
-
-Examples:
-
-Rules can be installed on both ingress and egress - this shows ingress
-only
-
-tc qdisc add dev eth0 ingress
-
-# example 1
-tc filter add dev eth0 parent ffff: protocol ip prio 6 u32 match ip src \
-10.0.0.9/32 flowid 1:16 action drop
-
-ping -c 20 10.0.0.9
-
---
-filter u32
-filter u32 fh 800: ht divisor 1
-filter u32 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:16 (rule hit 32 success 20)
- match 0a000009/ffffffff at 12 (success 20 )
- action order 1: gact action drop
- random type none pass val 0
- index 1 ref 1 bind 1 installed 59 sec used 35 sec
- Sent 1680 bytes 20 pkts (dropped 20, overlimits 0 )
-
-----
-
-# example 2
-#allow 1 out 10 randomly using the netrand generator
-tc filter add dev eth0 parent ffff: protocol ip prio 6 u32 match ip src \
-10.0.0.9/32 flowid 1:16 action drop random netrand ok 10
-
-ping -c 20 10.0.0.9
-
-----
-filter protocol ip pref 6 u32 filter protocol ip pref 6 u32 fh 800: ht divisor 1filter protocol ip pref 6 u32 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:16 (rule hit 20 success 20)
- match 0a000009/ffffffff at 12 (success 20 )
- action order 1: gact action drop
- random type netrand pass val 10
- index 5 ref 1 bind 1 installed 49 sec used 25 sec
- Sent 1680 bytes 20 pkts (dropped 16, overlimits 0 )
-
---------
-#alternative: deterministically accept every second packet
-tc filter add dev eth0 parent ffff: protocol ip prio 6 u32 match ip src \
-10.0.0.9/32 flowid 1:16 action drop random determ ok 2
-
-ping -c 20 10.0.0.9
-
-tc -s filter show parent ffff: dev eth0
------
-filter protocol ip pref 6 u32 filter protocol ip pref 6 u32 fh 800: ht divisor 1filter protocol ip pref 6 u32 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:16 (rule hit 20 success 20)
- match 0a000009/ffffffff at 12 (success 20 )
- action order 1: gact action drop
- random type determ pass val 2
- index 4 ref 1 bind 1 installed 118 sec used 82 sec
- Sent 1680 bytes 20 pkts (dropped 10, overlimits 0 )
------
diff --git a/doc/actions/ifb-README b/doc/actions/ifb-README
deleted file mode 100644
index 5fe917146..000000000
--- a/doc/actions/ifb-README
+++ /dev/null
@@ -1,125 +0,0 @@
-
-IFB is intended to replace IMQ.
-Advantage over current IMQ; cleaner in particular in in SMP;
-with a _lot_ less code.
-
-Known IMQ/IFB USES
-------------------
-
-As far as i know the reasons listed below is why people use IMQ.
-It would be nice to know of anything else that i missed.
-
-1) qdiscs/policies that are per device as opposed to system wide.
-IFB allows for sharing.
-
-2) Allows for queueing incoming traffic for shaping instead of
-dropping. I am not aware of any study that shows policing is
-worse than shaping in achieving the end goal of rate control.
-I would be interested if anyone is experimenting.
-
-3) Very interesting use: if you are serving p2p you may want to give
-preference to your own locally originated traffic (when responses come back)
-vs someone using your system to do bittorent. So QoSing based on state
-comes in as the solution. What people did to achieve this was stick
-the IMQ somewhere prelocal hook.
-I think this is a pretty neat feature to have in Linux in general.
-(i.e not just for IMQ).
-But i won't go back to putting netfilter hooks in the device to satisfy
-this. I also don't think its worth it hacking ifb some more to be
-aware of say L3 info and play ip rule tricks to achieve this.
---> Instead the plan is to have a conntrack related action. This action will
-selectively either query/create conntrack state on incoming packets.
-Packets could then be redirected to ifb based on what happens -> eg
-on incoming packets; if we find they are of known state we could send to
-a different queue than one which didn't have existing state. This
-all however is dependent on whatever rules the admin enters.
-
-At the moment this 3rd function does not exist yet. I have decided that
-instead of sitting on the patch for another year, to release it and then
-if there is pressure i will add this feature.
-
-An example, to provide functionality that most people use IMQ for below:
-
---------
-export TC="/sbin/tc"
-
-$TC qdisc add dev ifb0 root handle 1: prio
-$TC qdisc add dev ifb0 parent 1:1 handle 10: sfq
-$TC qdisc add dev ifb0 parent 1:2 handle 20: tbf rate 20kbit buffer 1600 limit 3000
-$TC qdisc add dev ifb0 parent 1:3 handle 30: sfq
-$TC filter add dev ifb0 protocol ip pref 1 parent 1: handle 1 fw classid 1:1
-$TC filter add dev ifb0 protocol ip pref 2 parent 1: handle 2 fw classid 1:2
-
-ifconfig ifb0 up
-
-$TC qdisc add dev eth0 ingress
-
-# redirect all IP packets arriving in eth0 to ifb0
-# use mark 1 --> puts them onto class 1:1
-$TC filter add dev eth0 parent ffff: protocol ip prio 10 u32 \
-match u32 0 0 flowid 1:1 \
-action ipt -j MARK --set-mark 1 \
-action mirred egress redirect dev ifb0
-
---------
-
-
-Run A Little test:
-
-from another machine ping so that you have packets going into the box:
------
-[root@jzny action-tests]# ping 10.22
-PING 10.22 (10.0.0.22): 56 data bytes
-64 bytes from 10.0.0.22: icmp_seq=0 ttl=64 time=2.8 ms
-64 bytes from 10.0.0.22: icmp_seq=1 ttl=64 time=0.6 ms
-64 bytes from 10.0.0.22: icmp_seq=2 ttl=64 time=0.6 ms
-
---- 10.22 ping statistics ---
-3 packets transmitted, 3 packets received, 0% packet loss
-round-trip min/avg/max = 0.6/1.3/2.8 ms
-[root@jzny action-tests]#
------
-Now look at some stats:
-
----
-[root@jmandrake]:~# $TC -s filter show parent ffff: dev eth0
-filter protocol ip pref 10 u32
-filter protocol ip pref 10 u32 fh 800: ht divisor 1
-filter protocol ip pref 10 u32 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:1
- match 00000000/00000000 at 0
- action order 1: tablename: mangle hook: NF_IP_PRE_ROUTING
- target MARK set 0x1
- index 1 ref 1 bind 1 installed 4195sec used 27sec
- Sent 252 bytes 3 pkts (dropped 0, overlimits 0)
-
- action order 2: mirred (Egress Redirect to device ifb0) stolen
- index 1 ref 1 bind 1 installed 165 sec used 27 sec
- Sent 252 bytes 3 pkts (dropped 0, overlimits 0)
-
-[root@jmandrake]:~# $TC -s qdisc
-qdisc sfq 30: dev ifb0 limit 128p quantum 1514b
- Sent 0 bytes 0 pkts (dropped 0, overlimits 0)
-qdisc tbf 20: dev ifb0 rate 20Kbit burst 1575b lat 2147.5s
- Sent 210 bytes 3 pkts (dropped 0, overlimits 0)
-qdisc sfq 10: dev ifb0 limit 128p quantum 1514b
- Sent 294 bytes 3 pkts (dropped 0, overlimits 0)
-qdisc prio 1: dev ifb0 bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
- Sent 504 bytes 6 pkts (dropped 0, overlimits 0)
-qdisc ingress ffff: dev eth0 ----------------
- Sent 308 bytes 5 pkts (dropped 0, overlimits 0)
-
-[root@jmandrake]:~# ifconfig ifb0
-ifb0 Link encap:Ethernet HWaddr 00:00:00:00:00:00
- inet6 addr: fe80::200:ff:fe00:0/64 Scope:Link
- UP BROADCAST RUNNING NOARP MTU:1500 Metric:1
- RX packets:6 errors:0 dropped:3 overruns:0 frame:0
- TX packets:3 errors:0 dropped:0 overruns:0 carrier:0
- collisions:0 txqueuelen:32
- RX bytes:504 (504.0 b) TX bytes:252 (252.0 b)
------
-
-You send it any packet not originating from the actions it will drop them.
-[In this case the three dropped packets were ipv6 ndisc].
-
-cheers,
-jamal
diff --git a/doc/actions/mirred-usage b/doc/actions/mirred-usage
deleted file mode 100644
index 482ff66d6..000000000
--- a/doc/actions/mirred-usage
+++ /dev/null
@@ -1,164 +0,0 @@
-
-Very funky action. I do plan to add to a few more things to it
-This is the basic stuff. Idea borrowed from the way ethernet switches
-mirror and redirect packets. The main difference with say a vannila
-ethernet switch is that you can use u32 classifier to select a
-flow to be mirrored. High end switches typically can select based
-on more than just a port (eg a 5 tuple classifier). They may also be
-capable of redirecting.
-
-Usage:
-
-mirred <DIRECTION> <ACTION> [index INDEX] <dev DEVICENAME>
-where:
-DIRECTION := <ingress | egress>
-ACTION := <mirror | redirect>
-INDEX is the specific policy instance id
-DEVICENAME is the devicename
-
-Direction:
-- Ingress is not supported at the moment. It will be in the
-future as well as mirror/redirecting to a socket.
-
-Action:
-- Mirror takes a copy of the packet and sends it to specified
-dev ("port" in ethernet switch/bridging terminology)
-- redirect
-steals the packet and redirects to specified destination dev.
-
-What NOT to do if you don't want your machine to crash:
-------------------------------------------------------
-
-Do not create loops!
-Loops are not hard to create in the egress qdiscs.
-
-Here are simple rules to follow if you don't want to get
-hurt:
-A) Do not have the same packet go to same netdevice twice
-in a single graph of policies. Your machine will just hang!
-This is design intent _not a bug_ to teach you some lessons.
-
-In the future if there are easy ways to do this in the kernel
-without affecting other packets not interested in this feature
-I will add them. At the moment that is not clear.
-
-Some examples of bad things NOT to do:
-1) redirecting eth0 to eth0
-2) eth0->eth1-> eth0
-3) eth0->lo-> eth1-> eth0
-
-B) Do not redirect from one IFB device to another.
-Remember that IFB is a very specialized case of packet redirecting
-device. Instead of redirecting it puts packets at the exact spot
-on the stack it found them from.
-Redirecting from ifbX->ifbY will actually not crash your machine but your
-packets will all be dropped (this is much simpler to detect
-and resolve and is only affecting users of ifb as opposed to the
-whole stack).
-
-In the case of A) the problem has to do with a recursive contention
-for the devices queue lock and in the second case for the transmit lock.
-
-Some examples:
--------------
-
-1) Mirror all packets arriving on eth0 to be sent out on eth1.
-You may have a sniffer or some accounting box hooked up on eth1.
-
----
-tc qdisc add dev eth0 ingress
-tc filter add dev eth0 parent ffff: protocol ip prio 10 u32 \
-match u32 0 0 flowid 1:2 action mirred egress mirror dev eth1
----
-
-If you replace "mirror" with "redirect" then not a copy but rather
-the original packet is sent to eth1.
-
-2) Host A is hooked up to us on eth0
-
-# redirect all packets arriving on ingress of lo to eth0
----
-tc qdisc add dev lo ingress
-tc filter add dev lo parent ffff: protocol ip prio 10 u32 \
-match u32 0 0 flowid 1:2 action mirred egress redirect dev eth0
----
-
-On host A start a tcpdump on interface connecting to us.
-
-on our host ping -c 2 127.0.0.1
-
-Ping would fail since all packets are heading out eth0
-tcpudmp on host A would show them
-
-if you substitute the redirect with mirror above as in:
-tc filter add dev lo parent ffff: protocol ip prio 10 u32 \
-match u32 0 0 flowid 1:2 action mirred egress mirror dev eth0
-
-Then you should see the packets on both host A and the local
-stack (i.e ping would work).
-
-3) Even more funky example:
-
-#
-#allow 1 out 10 packets on ingress of lo to randomly make it to the
-# host A (Randomness uses the netrand generator)
-#
----
-tc filter add dev lo parent ffff: protocol ip prio 10 u32 \
-match u32 0 0 flowid 1:2 \
-action drop random determ ok 10\
-action mirred egress mirror dev eth0
----
-
-4)
-# for packets from 10.0.0.9 going out on eth0 (could be local
-# IP or something # we are forwarding) -
-# if exceeding a 100Kbps rate, then redirect to eth1
-#
-
----
-tc qdisc add dev eth0 handle 1:0 root prio
-tc filter add dev eth0 parent 1:0 protocol ip prio 6 u32 \
-match ip src 10.0.0.9/32 flowid 1:16 \
-action police rate 100kbit burst 90k ok \
-action mirred egress mirror dev eth1
----
-
-A more interesting example is when you mirror flows to a dummy device
-so you could tcpdump them (dummy by defaults drops all packets it sees).
-This is a very useful debug feature.
-
-Lets say you are policing packets from alias 192.168.200.200/32
-you don't want those to exceed 100kbps going out.
-
----
-tc qdisc add dev eth0 handle 1:0 root prio
-tc filter add dev eth0 parent 1: protocol ip prio 10 u32 \
-match ip src 192.168.200.200/32 flowid 1:2 \
-action police rate 100kbit burst 90k drop
----
-
-If you run tcpdump on eth0 you will see all packets going out
-with src 192.168.200.200/32 dropped or not (since tcpdump shows
-all packets being egressed).
-Extend the rule a little to see only the packets making it out.
-
----
-tc qdisc add dev eth0 handle 1:0 root prio
-tc filter add dev eth0 parent 1: protocol ip prio 10 u32 \
-match ip src 192.168.200.200/32 flowid 1:2 \
-action police rate 10kbit burst 90k drop \
-action mirred egress mirror dev dummy0
----
-
-Now fire tcpdump on dummy0 to see only those packets ..
-tcpdump -n -i dummy0 -x -e -t
-
-Essentially a good debugging/logging interface (sort of like
-BSDs speacialized log device does without needing one).
-
-If you replace mirror with redirect, those packets will be
-blackholed and will never make it out.
-
-cheers,
-jamal
diff --git a/genl/ctrl.c b/genl/ctrl.c
index d5b765cca..aff922a43 100644
--- a/genl/ctrl.c
+++ b/genl/ctrl.c
@@ -329,7 +329,7 @@ static int ctrl_listen(int argc, char **argv)
struct rtnl_handle rth;
if (rtnl_open_byproto(&rth, nl_mgrp(GENL_ID_CTRL), NETLINK_GENERIC) < 0) {
- fprintf(stderr, "Canot open generic netlink socket\n");
+ fprintf(stderr, "Cannot open generic netlink socket\n");
return -1;
}
diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index c7082cc60..ea277039f 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -99,7 +99,7 @@ enum {
* versions.
*/
#define TCA_ACT_GACT 5
-#define TCA_ACT_IPT 6
+#define TCA_ACT_IPT 6 /* obsoleted, can be reused */
#define TCA_ACT_PEDIT 7
#define TCA_ACT_MIRRED 8
#define TCA_ACT_NAT 9
@@ -120,7 +120,7 @@ enum tca_id {
TCA_ID_UNSPEC = 0,
TCA_ID_POLICE = 1,
TCA_ID_GACT = TCA_ACT_GACT,
- TCA_ID_IPT = TCA_ACT_IPT,
+ TCA_ID_IPT = TCA_ACT_IPT, /* Obsoleted, can be reused */
TCA_ID_PEDIT = TCA_ACT_PEDIT,
TCA_ID_MIRRED = TCA_ACT_MIRRED,
TCA_ID_NAT = TCA_ACT_NAT,
@@ -280,37 +280,6 @@ struct tc_u32_pcnt {
#define TC_U32_MAXDEPTH 8
-
-/* RSVP filter */
-
-enum {
- TCA_RSVP_UNSPEC,
- TCA_RSVP_CLASSID,
- TCA_RSVP_DST,
- TCA_RSVP_SRC,
- TCA_RSVP_PINFO,
- TCA_RSVP_POLICE,
- TCA_RSVP_ACT,
- __TCA_RSVP_MAX
-};
-
-#define TCA_RSVP_MAX (__TCA_RSVP_MAX - 1 )
-
-struct tc_rsvp_gpi {
- __u32 key;
- __u32 mask;
- int offset;
-};
-
-struct tc_rsvp_pinfo {
- struct tc_rsvp_gpi dpi;
- struct tc_rsvp_gpi spi;
- __u8 protocol;
- __u8 tunnelid;
- __u8 tunnelhdr;
- __u8 pad;
-};
-
/* ROUTE filter */
enum {
@@ -341,22 +310,6 @@ enum {
#define TCA_FW_MAX (__TCA_FW_MAX - 1)
-/* TC index filter */
-
-enum {
- TCA_TCINDEX_UNSPEC,
- TCA_TCINDEX_HASH,
- TCA_TCINDEX_MASK,
- TCA_TCINDEX_SHIFT,
- TCA_TCINDEX_FALL_THROUGH,
- TCA_TCINDEX_CLASSID,
- TCA_TCINDEX_POLICE,
- TCA_TCINDEX_ACT,
- __TCA_TCINDEX_MAX
-};
-
-#define TCA_TCINDEX_MAX (__TCA_TCINDEX_MAX - 1)
-
/* Flow filter */
enum {
diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index f762a10bf..a3cd0c2dc 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -477,115 +477,6 @@ enum {
#define TCA_HFSC_MAX (__TCA_HFSC_MAX - 1)
-
-/* CBQ section */
-
-#define TC_CBQ_MAXPRIO 8
-#define TC_CBQ_MAXLEVEL 8
-#define TC_CBQ_DEF_EWMA 5
-
-struct tc_cbq_lssopt {
- unsigned char change;
- unsigned char flags;
-#define TCF_CBQ_LSS_BOUNDED 1
-#define TCF_CBQ_LSS_ISOLATED 2
- unsigned char ewma_log;
- unsigned char level;
-#define TCF_CBQ_LSS_FLAGS 1
-#define TCF_CBQ_LSS_EWMA 2
-#define TCF_CBQ_LSS_MAXIDLE 4
-#define TCF_CBQ_LSS_MINIDLE 8
-#define TCF_CBQ_LSS_OFFTIME 0x10
-#define TCF_CBQ_LSS_AVPKT 0x20
- __u32 maxidle;
- __u32 minidle;
- __u32 offtime;
- __u32 avpkt;
-};
-
-struct tc_cbq_wrropt {
- unsigned char flags;
- unsigned char priority;
- unsigned char cpriority;
- unsigned char __reserved;
- __u32 allot;
- __u32 weight;
-};
-
-struct tc_cbq_ovl {
- unsigned char strategy;
-#define TC_CBQ_OVL_CLASSIC 0
-#define TC_CBQ_OVL_DELAY 1
-#define TC_CBQ_OVL_LOWPRIO 2
-#define TC_CBQ_OVL_DROP 3
-#define TC_CBQ_OVL_RCLASSIC 4
- unsigned char priority2;
- __u16 pad;
- __u32 penalty;
-};
-
-struct tc_cbq_police {
- unsigned char police;
- unsigned char __res1;
- unsigned short __res2;
-};
-
-struct tc_cbq_fopt {
- __u32 split;
- __u32 defmap;
- __u32 defchange;
-};
-
-struct tc_cbq_xstats {
- __u32 borrows;
- __u32 overactions;
- __s32 avgidle;
- __s32 undertime;
-};
-
-enum {
- TCA_CBQ_UNSPEC,
- TCA_CBQ_LSSOPT,
- TCA_CBQ_WRROPT,
- TCA_CBQ_FOPT,
- TCA_CBQ_OVL_STRATEGY,
- TCA_CBQ_RATE,
- TCA_CBQ_RTAB,
- TCA_CBQ_POLICE,
- __TCA_CBQ_MAX,
-};
-
-#define TCA_CBQ_MAX (__TCA_CBQ_MAX - 1)
-
-/* dsmark section */
-
-enum {
- TCA_DSMARK_UNSPEC,
- TCA_DSMARK_INDICES,
- TCA_DSMARK_DEFAULT_INDEX,
- TCA_DSMARK_SET_TC_INDEX,
- TCA_DSMARK_MASK,
- TCA_DSMARK_VALUE,
- __TCA_DSMARK_MAX,
-};
-
-#define TCA_DSMARK_MAX (__TCA_DSMARK_MAX - 1)
-
-/* ATM section */
-
-enum {
- TCA_ATM_UNSPEC,
- TCA_ATM_FD, /* file/socket descriptor */
- TCA_ATM_PTR, /* pointer to descriptor - later */
- TCA_ATM_HDR, /* LL header */
- TCA_ATM_EXCESS, /* excess traffic class (0 for CLP) */
- TCA_ATM_ADDR, /* PVC address (for output only) */
- TCA_ATM_STATE, /* VC state (ATM_VS_*; for output only) */
- __TCA_ATM_MAX,
-};
-
-#define TCA_ATM_MAX (__TCA_ATM_MAX - 1)
-
/* Network emulator */
enum {
diff --git a/include/uapi/linux/tc_act/tc_ipt.h b/include/uapi/linux/tc_act/tc_ipt.h
deleted file mode 100644
index c48d7da67..000000000
--- a/include/uapi/linux/tc_act/tc_ipt.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef __LINUX_TC_IPT_H
-#define __LINUX_TC_IPT_H
-
-#include <linux/pkt_cls.h>
-
-enum {
- TCA_IPT_UNSPEC,
- TCA_IPT_TABLE,
- TCA_IPT_HOOK,
- TCA_IPT_INDEX,
- TCA_IPT_CNT,
- TCA_IPT_TM,
- TCA_IPT_TARG,
- TCA_IPT_PAD,
- __TCA_IPT_MAX
-};
-#define TCA_IPT_MAX (__TCA_IPT_MAX - 1)
-
-#endif
diff --git a/ip/iplink_xstats.c b/ip/iplink_xstats.c
index 6c184c02c..8d367984e 100644
--- a/ip/iplink_xstats.c
+++ b/ip/iplink_xstats.c
@@ -63,7 +63,7 @@ int iplink_ifla_xstats(int argc, char **argv)
if (rtnl_statsdump_req_filter(&rth, AF_UNSPEC, filt_mask,
NULL, NULL) < 0) {
- perror("Cannont send dump request");
+ perror("Cannot send dump request");
return -1;
}
diff --git a/man/man8/tc-bfifo.8 b/man/man8/tc-bfifo.8
index 3e290322f..bc05ef4d8 100644
--- a/man/man8/tc-bfifo.8
+++ b/man/man8/tc-bfifo.8
@@ -37,8 +37,6 @@ If the list is too long, no further packets are allowed on. This is called 'tail
limit
Maximum queue size. Specified in bytes for bfifo, in packets for pfifo. For pfifo, defaults
to the interface txqueuelen, as specified with
-.BR ifconfig (8)
-or
.BR ip (8).
The range for this parameter is [0, UINT32_MAX].
diff --git a/man/man8/tc-gact.8 b/man/man8/tc-gact.8
new file mode 100644
index 000000000..81aa30eba
--- /dev/null
+++ b/man/man8/tc-gact.8
@@ -0,0 +1,85 @@
+.TH "Generic actions in tc" 8 "11 Jan 2023" "iproute2" "Linux"
+
+.SH NAME
+gact - generic action
+.SH SYNOPSIS
+.in +8
+.ti -8
+.BR tc " ... " "action gact"
+.IR CONTROL " [ " RAND " ] [ " INDEX " ]"
+.ti -8
+.IR CONTROL " := { "
+.BR reclassify " | " drop " | " continue " | " pass " | " pipe " | "
+.br
+.BI "goto chain " "CHAIN_INDEX"
+|
+.br
+.BI "jump " "JUMP_COUNT"
+}
+
+.ti -8
+.IR RAND " := "
+.BI random " RANDTYPE CONTROL VAL"
+.ti -8
+.IR RANDTYPE " := { "
+.BR netrand " | " determ " }"
+.ti -8
+.IR VAL " := number not exceeding 10000"
+.ti -8
+.IR JUMP_COUNT " := absolute jump from start of action list"
+.ti -8
+.IR INDEX " := index value used"
+
+.SH DESCRIPTION
+The
+.B gact
+action allows reclassifying, dropping, passing, or accepting packets.
+At the moment there are only two algorithms. One is deterministic
+and the other uses internal kernel netrand.
+
+.SH OPTIONS
+.TP
+.BI random " RANDTYPE CONTROL VAL"
+The probability of taking the action expressed in terms of 1 out of
+.I VAL
+packets.
+
+.TP
+.I CONTROL
+Indicate how
+.B tc
+should proceed if the packet matches.
+For a description of the possible
+.I CONTROL
+values, see
+.BR tc-actions (8).
+
+.SH EXAMPLES
+Apply a rule on ingress to drop packets from a given source address.
+.RS
+.EX
+# tc filter add dev eth0 parent ffff: protocol ip prio 6 u32 match ip src \
+10.0.0.9/32 flowid 1:16 action drop
+.EE
+.RE
+
+Allow 1 out of 10 packets from source randomly using the netrand generator
+.RS
+.EX
+# tc filter add dev eth0 parent ffff: protocol ip prio 6 u32 match ip src \
+10.0.0.9/32 flowid 1:16 action drop random netrand ok 10
+.EE
+.RE
+
+Deterministically accept every second packet
+.RS
+.EX
+# tc filter add dev eth0 parent ffff: protocol ip prio 6 u32 match ip src \
+10.0.0.9/32 flowid 1:16 action drop random determ ok 2
+.EE
+.RE
+
+.SH SEE ALSO
+.BR tc (8),
+.BR tc-actions (8),
+.BR tc-u32 (8)
diff --git a/man/man8/tc-mirred.8 b/man/man8/tc-mirred.8
index 38833b452..e529fa6a0 100644
--- a/man/man8/tc-mirred.8
+++ b/man/man8/tc-mirred.8
@@ -94,6 +94,14 @@ interface, it is possible to send ingress traffic through an instance of
.EE
.RE
+.SH LIMITATIONS
+The kernel restricts nesting to four levels to avoid the chance
+of nesting loops.
+.PP
+Do not redirect from one IFB device to another.
+IFB is a very specialized case of packet redirecting device.
+Redirecting from ifbX->ifbY will cause all packets to be dropped.
+
.SH SEE ALSO
.BR tc (8),
.BR tc-u32 (8)
diff --git a/man/man8/tc-pfifo_fast.8 b/man/man8/tc-pfifo_fast.8
index baf34b1df..0029d67f4 100644
--- a/man/man8/tc-pfifo_fast.8
+++ b/man/man8/tc-pfifo_fast.8
@@ -27,8 +27,6 @@ have traffic, higher bands are never dequeued. This can be used to
prioritize interactive traffic or penalize 'lowest cost' traffic.
Each band can be txqueuelen packets long, as configured with
-.BR ifconfig (8)
-or
.BR ip (8).
Additional packets coming in are not enqueued but are instead dropped.
@@ -40,8 +38,6 @@ for complete details on how TOS bits are translated into bands.
txqueuelen
The length of the three bands depends on the interface txqueuelen, as
specified with
-.BR ifconfig (8)
-or
.BR ip (8).
.SH BUGS
diff --git a/man/man8/tc.8 b/man/man8/tc.8
index e5bef911f..3175454b9 100644
--- a/man/man8/tc.8
+++ b/man/man8/tc.8
@@ -871,6 +871,7 @@ was written by Alexey N. Kuznetsov and added in Linux 2.2.
.BR tc-fq_codel (8),
.BR tc-fq_pie (8),
.BR tc-fw (8),
+.BR tc-gact (8),
.BR tc-hfsc (7),
.BR tc-hfsc (8),
.BR tc-htb (8),
diff --git a/misc/ss.c b/misc/ss.c
index 900fefa42..5296cabe9 100644
--- a/misc/ss.c
+++ b/misc/ss.c
@@ -2427,6 +2427,8 @@ static void proc_ctx_print(struct sockstat *s)
free(buf);
}
}
+
+ field_next();
}
static void inet_stats_print(struct sockstat *s, bool v6only)
diff --git a/tc/m_gate.c b/tc/m_gate.c
index c091ae19c..37afa426a 100644
--- a/tc/m_gate.c
+++ b/tc/m_gate.c
@@ -20,18 +20,6 @@ struct gate_entry {
int32_t maxoctets;
};
-#define CLOCKID_INVALID (-1)
-static const struct clockid_table {
- const char *name;
- clockid_t clockid;
-} clockt_map[] = {
- { "REALTIME", CLOCK_REALTIME },
- { "TAI", CLOCK_TAI },
- { "BOOTTIME", CLOCK_BOOTTIME },
- { "MONOTONIC", CLOCK_MONOTONIC },
- { NULL }
-};
-
static void explain(void)
{
fprintf(stderr,
@@ -78,35 +66,6 @@ struct action_util gate_action_util = {
.print_aopt = print_gate,
};
-static int get_clockid(__s32 *val, const char *arg)
-{
- const struct clockid_table *c;
-
- if (strcasestr(arg, "CLOCK_") != NULL)
- arg += sizeof("CLOCK_") - 1;
-
- for (c = clockt_map; c->name; c++) {
- if (strcasecmp(c->name, arg) == 0) {
- *val = c->clockid;
- return 0;
- }
- }
-
- return -1;
-}
-
-static const char *get_clock_name(clockid_t clockid)
-{
- const struct clockid_table *c;
-
- for (c = clockt_map; c->name; c++) {
- if (clockid == c->clockid)
- return c->name;
- }
-
- return "invalid";
-}
-
static int get_gate_state(__u8 *val, const char *arg)
{
if (!strcasecmp("OPEN", arg)) {
diff --git a/tc/q_etf.c b/tc/q_etf.c
index 572e2bc89..d16188daa 100644
--- a/tc/q_etf.c
+++ b/tc/q_etf.c
@@ -19,18 +19,6 @@
#include "utils.h"
#include "tc_util.h"
-#define CLOCKID_INVALID (-1)
-static const struct static_clockid {
- const char *name;
- clockid_t clockid;
-} clockids_sysv[] = {
- { "REALTIME", CLOCK_REALTIME },
- { "TAI", CLOCK_TAI },
- { "BOOTTIME", CLOCK_BOOTTIME },
- { "MONOTONIC", CLOCK_MONOTONIC },
- { NULL }
-};
-
static void explain(void)
{
fprintf(stderr,
@@ -51,37 +39,6 @@ static void explain_clockid(const char *val)
val);
}
-static int get_clockid(__s32 *val, const char *arg)
-{
- const struct static_clockid *c;
-
- /* Drop the CLOCK_ prefix if that is being used. */
- if (strcasestr(arg, "CLOCK_") != NULL)
- arg += sizeof("CLOCK_") - 1;
-
- for (c = clockids_sysv; c->name; c++) {
- if (strcasecmp(c->name, arg) == 0) {
- *val = c->clockid;
-
- return 0;
- }
- }
-
- return -1;
-}
-
-static const char* get_clock_name(clockid_t clockid)
-{
- const struct static_clockid *c;
-
- for (c = clockids_sysv; c->name; c++) {
- if (clockid == c->clockid)
- return c->name;
- }
-
- return "invalid";
-}
-
static int etf_parse_opt(struct qdisc_util *qu, int argc,
char **argv, struct nlmsghdr *n, const char *dev)
{
diff --git a/tc/q_taprio.c b/tc/q_taprio.c
index ef8fc7a05..c47fe2443 100644
--- a/tc/q_taprio.c
+++ b/tc/q_taprio.c
@@ -29,18 +29,6 @@ struct sched_entry {
uint8_t cmd;
};
-#define CLOCKID_INVALID (-1)
-static const struct static_clockid {
- const char *name;
- clockid_t clockid;
-} clockids_sysv[] = {
- { "REALTIME", CLOCK_REALTIME },
- { "TAI", CLOCK_TAI },
- { "BOOTTIME", CLOCK_BOOTTIME },
- { "MONOTONIC", CLOCK_MONOTONIC },
- { NULL }
-};
-
static void explain(void)
{
fprintf(stderr,
@@ -60,37 +48,6 @@ static void explain_clockid(const char *val)
fprintf(stderr, "It must be a valid SYS-V id (i.e. CLOCK_TAI)\n");
}
-static int get_clockid(__s32 *val, const char *arg)
-{
- const struct static_clockid *c;
-
- /* Drop the CLOCK_ prefix if that is being used. */
- if (strcasestr(arg, "CLOCK_") != NULL)
- arg += sizeof("CLOCK_") - 1;
-
- for (c = clockids_sysv; c->name; c++) {
- if (strcasecmp(c->name, arg) == 0) {
- *val = c->clockid;
-
- return 0;
- }
- }
-
- return -1;
-}
-
-static const char* get_clock_name(clockid_t clockid)
-{
- const struct static_clockid *c;
-
- for (c = clockids_sysv; c->name; c++) {
- if (clockid == c->clockid)
- return c->name;
- }
-
- return "invalid";
-}
-
static const char *entry_cmd_to_str(__u8 cmd)
{
switch (cmd) {
diff --git a/tc/tc_util.c b/tc/tc_util.c
index 8c0e19e45..aa7cf60fa 100644
--- a/tc/tc_util.c
+++ b/tc/tc_util.c
@@ -596,6 +596,60 @@ char *sprint_linklayer(unsigned int linklayer, char *buf)
return buf;
}
+/*
+ * Limited list of clockid's
+ * Since these are the ones the kernel qdisc can use
+ * because they are available via ktime_get
+ */
+static const struct clockid_table {
+ const char *name;
+ clockid_t clockid;
+} clockt_map[] = {
+#ifdef CLOCK_BOOTTIME
+ { "BOOTTIME", CLOCK_BOOTTIME },
+#endif
+#ifdef CLOCK_MONOTONIC
+ { "MONOTONIC", CLOCK_MONOTONIC },
+#endif
+#ifdef CLOCK_REALTIME
+ { "REALTIME", CLOCK_REALTIME },
+#endif
+#ifdef CLOCK_TAI
+ { "TAI", CLOCK_TAI },
+#endif
+ { NULL }
+};
+
+int get_clockid(__s32 *val, const char *arg)
+{
+ const struct clockid_table *c;
+
+ /* skip prefix if present */
+ if (strcasestr(arg, "CLOCK_") != NULL)
+ arg += sizeof("CLOCK_") - 1;
+
+ for (c = clockt_map; c->name; c++) {
+ if (strcasecmp(c->name, arg) == 0) {
+ *val = c->clockid;
+ return 0;
+ }
+ }
+
+ return -1;
+}
+
+const char *get_clock_name(clockid_t clockid)
+{
+ const struct clockid_table *c;
+
+ for (c = clockt_map; c->name; c++) {
+ if (clockid == c->clockid)
+ return c->name;
+ }
+
+ return "invalid";
+}
+
void print_tm(FILE *f, const struct tcf_t *tm)
{
int hz = get_user_hz();
diff --git a/tc/tc_util.h b/tc/tc_util.h
index c535dccbc..aaf10e433 100644
--- a/tc/tc_util.h
+++ b/tc/tc_util.h
@@ -121,6 +121,10 @@ int prio_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt);
int cls_names_init(char *path);
void cls_names_uninit(void);
+#define CLOCKID_INVALID (-1)
+int get_clockid(__s32 *val, const char *arg);
+const char *get_clock_name(clockid_t clockid);
+
int action_a2n(char *arg, int *result, bool allow_num);
bool tc_qdisc_block_exists(__u32 block_index);