1 // SPDX-License-Identifier: GPL-2.0
2 // Copyright (c) 2022 Meta
3 
4 #include <stddef.h>
5 #include <stdint.h>
6 #include <stdbool.h>
7 #include <linux/bpf.h>
8 #include <linux/stddef.h>
9 #include <linux/pkt_cls.h>
10 #include <linux/if_ether.h>
11 #include <linux/in.h>
12 #include <linux/ip.h>
13 #include <linux/ipv6.h>
14 #include <linux/tcp.h>
15 #include <linux/udp.h>
16 #include <bpf/bpf_helpers.h>
17 #include <bpf/bpf_endian.h>
18 
/* veth_src --- veth_src_fwd --- veth_dst_fwd --- veth_dst
 *           |                                 |
 *  ns_src   |              ns_fwd             |   ns_dst
 *
 * ns_src and ns_dst: ENDHOST namespace
 *            ns_fwd: Forwarding namespace
 */
26 
/* Turn an integer-typed context field (e.g. skb->data, skb->data_end)
 * into a pointer.  The expansion is fully parenthesized so that it
 * behaves as a single expression wherever it is used.
 */
#define ctx_ptr(field)		((void *)(long)(field))

/* IPv4 source addresses matched by skb_get_type(), network byte order */
#define ip4_src			__bpf_htonl(0xac100164) /* 172.16.1.100 */
#define ip4_dst			__bpf_htonl(0xac100264) /* 172.16.2.100 */

/* IPv6 source addresses matched by skb_get_type(), written as 16-byte
 * brace initializers: ::1:dead:beef:cafe and ::2:dead:beef:cafe
 */
#define ip6_src			{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \
				  0x00, 0x01, 0xde, 0xad, 0xbe, 0xef, 0xca, 0xfe }
#define ip6_dst			{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \
				  0x00, 0x02, 0xde, 0xad, 0xbe, 0xef, 0xca, 0xfe }

/* Compare two objects that expose a 4-word s6_addr32[] member.
 * Both arguments are parenthesized so that expressions such as "*p"
 * or a compound literal can be passed safely.  Note that each argument
 * is evaluated up to four times, so it must not have side effects.
 */
#define v6_equal(a, b)		((a).s6_addr32[0] == (b).s6_addr32[0] && \
				 (a).s6_addr32[1] == (b).s6_addr32[1] && \
				 (a).s6_addr32[2] == (b).s6_addr32[2] && \
				 (a).s6_addr32[3] == (b).s6_addr32[3])
41 
42 volatile const __u32 IFINDEX_SRC;
43 volatile const __u32 IFINDEX_DST;
44 
45 #define EGRESS_ENDHOST_MAGIC	0x0b9fbeef
46 #define INGRESS_FWDNS_MAGIC	0x1b9fbeef
47 #define EGRESS_FWDNS_MAGIC	0x2b9fbeef
48 
49 enum {
50 	INGRESS_FWDNS_P100,
51 	INGRESS_FWDNS_P101,
52 	EGRESS_FWDNS_P100,
53 	EGRESS_FWDNS_P101,
54 	INGRESS_ENDHOST,
55 	EGRESS_ENDHOST,
56 	SET_DTIME,
57 	__MAX_CNT,
58 };
59 
60 enum {
61 	TCP_IP6_CLEAR_DTIME,
62 	TCP_IP4,
63 	TCP_IP6,
64 	UDP_IP4,
65 	UDP_IP6,
66 	TCP_IP4_RT_FWD,
67 	TCP_IP6_RT_FWD,
68 	UDP_IP4_RT_FWD,
69 	UDP_IP6_RT_FWD,
70 	UKN_TEST,
71 	__NR_TESTS,
72 };
73 
74 enum {
75 	SRC_NS = 1,
76 	DST_NS,
77 };
78 
79 __u32 dtimes[__NR_TESTS][__MAX_CNT] = {};
80 __u32 errs[__NR_TESTS][__MAX_CNT] = {};
81 __u32 test = 0;
82 
/* Increment dtimes[][idx] in the row of the current test.  A "test"
 * value outside the table is accounted to the UKN_TEST row instead.
 */
static void inc_dtimes(__u32 idx)
{
	__u32 row = test < __NR_TESTS ? test : UKN_TEST;

	dtimes[row][idx]++;
}
90 
/* Increment errs[][idx] in the row of the current test.  A "test"
 * value outside the table is accounted to the UKN_TEST row instead.
 */
static void inc_errs(__u32 idx)
{
	__u32 row = test < __NR_TESTS ? test : UKN_TEST;

	errs[row][idx]++;
}
98 
/* Extract the inet protocol (lowest byte) from a skb_get_type() value */
static int skb_proto(int type)
{
	const int low_byte = 0xff;

	return type & low_byte;
}
103 
/* Extract the sender netns (second byte) from a skb_get_type() value */
static int skb_ns(int type)
{
	int upper = type >> 8;

	return upper & 0xff;
}
108 
fwdns_clear_dtime(void)109 static bool fwdns_clear_dtime(void)
110 {
111 	return test == TCP_IP6_CLEAR_DTIME;
112 }
113 
bpf_fwd(void)114 static bool bpf_fwd(void)
115 {
116 	return test < TCP_IP4_RT_FWD;
117 }
118 
get_proto(void)119 static __u8 get_proto(void)
120 {
121 	switch (test) {
122 	case UDP_IP4:
123 	case UDP_IP6:
124 	case UDP_IP4_RT_FWD:
125 	case UDP_IP6_RT_FWD:
126 		return IPPROTO_UDP;
127 	default:
128 		return IPPROTO_TCP;
129 	}
130 }
131 
132 /* -1: parse error: TC_ACT_SHOT
133  *  0: not testing traffic: TC_ACT_OK
134  * >0: first byte is the inet_proto, second byte has the netns
135  *     of the sender
136  */
skb_get_type(struct __sk_buff * skb)137 static int skb_get_type(struct __sk_buff *skb)
138 {
139 	__u16 dst_ns_port = __bpf_htons(50000 + test);
140 	void *data_end = ctx_ptr(skb->data_end);
141 	void *data = ctx_ptr(skb->data);
142 	__u8 inet_proto = 0, ns = 0;
143 	struct ipv6hdr *ip6h;
144 	__u16 sport, dport;
145 	struct iphdr *iph;
146 	struct tcphdr *th;
147 	struct udphdr *uh;
148 	void *trans;
149 
150 	switch (skb->protocol) {
151 	case __bpf_htons(ETH_P_IP):
152 		iph = data + sizeof(struct ethhdr);
153 		if (iph + 1 > data_end)
154 			return -1;
155 		if (iph->saddr == ip4_src)
156 			ns = SRC_NS;
157 		else if (iph->saddr == ip4_dst)
158 			ns = DST_NS;
159 		inet_proto = iph->protocol;
160 		trans = iph + 1;
161 		break;
162 	case __bpf_htons(ETH_P_IPV6):
163 		ip6h = data + sizeof(struct ethhdr);
164 		if (ip6h + 1 > data_end)
165 			return -1;
166 		if (v6_equal(ip6h->saddr, (struct in6_addr){{ip6_src}}))
167 			ns = SRC_NS;
168 		else if (v6_equal(ip6h->saddr, (struct in6_addr){{ip6_dst}}))
169 			ns = DST_NS;
170 		inet_proto = ip6h->nexthdr;
171 		trans = ip6h + 1;
172 		break;
173 	default:
174 		return 0;
175 	}
176 
177 	/* skb is not from src_ns or dst_ns.
178 	 * skb is not the testing IPPROTO.
179 	 */
180 	if (!ns || inet_proto != get_proto())
181 		return 0;
182 
183 	switch (inet_proto) {
184 	case IPPROTO_TCP:
185 		th = trans;
186 		if (th + 1 > data_end)
187 			return -1;
188 		sport = th->source;
189 		dport = th->dest;
190 		break;
191 	case IPPROTO_UDP:
192 		uh = trans;
193 		if (uh + 1 > data_end)
194 			return -1;
195 		sport = uh->source;
196 		dport = uh->dest;
197 		break;
198 	default:
199 		return 0;
200 	}
201 
202 	/* The skb is the testing traffic */
203 	if ((ns == SRC_NS && dport == dst_ns_port) ||
204 	    (ns == DST_NS && sport == dst_ns_port))
205 		return (ns << 8 | inet_proto);
206 
207 	return 0;
208 }
209 
210 /* format: direction@iface@netns
211  * egress@veth_(src|dst)@ns_(src|dst)
212  */
213 SEC("tc")
egress_host(struct __sk_buff * skb)214 int egress_host(struct __sk_buff *skb)
215 {
216 	int skb_type;
217 
218 	skb_type = skb_get_type(skb);
219 	if (skb_type == -1)
220 		return TC_ACT_SHOT;
221 	if (!skb_type)
222 		return TC_ACT_OK;
223 
224 	if (skb_proto(skb_type) == IPPROTO_TCP) {
225 		if (skb->tstamp_type == BPF_SKB_CLOCK_MONOTONIC &&
226 		    skb->tstamp)
227 			inc_dtimes(EGRESS_ENDHOST);
228 		else
229 			inc_errs(EGRESS_ENDHOST);
230 	} else if (skb_proto(skb_type) == IPPROTO_UDP) {
231 		if (skb->tstamp_type == BPF_SKB_CLOCK_TAI &&
232 		    skb->tstamp)
233 			inc_dtimes(EGRESS_ENDHOST);
234 		else
235 			inc_errs(EGRESS_ENDHOST);
236 	} else {
237 		if (skb->tstamp_type == BPF_SKB_CLOCK_REALTIME &&
238 		    skb->tstamp)
239 			inc_errs(EGRESS_ENDHOST);
240 	}
241 
242 	skb->tstamp = EGRESS_ENDHOST_MAGIC;
243 
244 	return TC_ACT_OK;
245 }
246 
247 /* ingress@veth_(src|dst)@ns_(src|dst) */
248 SEC("tc")
ingress_host(struct __sk_buff * skb)249 int ingress_host(struct __sk_buff *skb)
250 {
251 	int skb_type;
252 
253 	skb_type = skb_get_type(skb);
254 	if (skb_type == -1)
255 		return TC_ACT_SHOT;
256 	if (!skb_type)
257 		return TC_ACT_OK;
258 
259 	if (skb->tstamp_type == BPF_SKB_CLOCK_MONOTONIC &&
260 	    skb->tstamp == EGRESS_FWDNS_MAGIC)
261 		inc_dtimes(INGRESS_ENDHOST);
262 	else
263 		inc_errs(INGRESS_ENDHOST);
264 
265 	return TC_ACT_OK;
266 }
267 
268 /* ingress@veth_(src|dst)_fwd@ns_fwd priority 100 */
269 SEC("tc")
ingress_fwdns_prio100(struct __sk_buff * skb)270 int ingress_fwdns_prio100(struct __sk_buff *skb)
271 {
272 	int skb_type;
273 
274 	skb_type = skb_get_type(skb);
275 	if (skb_type == -1)
276 		return TC_ACT_SHOT;
277 	if (!skb_type)
278 		return TC_ACT_OK;
279 
280 	/* delivery_time is only available to the ingress
281 	 * if the tc-bpf checks the skb->tstamp_type.
282 	 */
283 	if (skb->tstamp == EGRESS_ENDHOST_MAGIC)
284 		inc_errs(INGRESS_FWDNS_P100);
285 
286 	if (fwdns_clear_dtime())
287 		skb->tstamp = 0;
288 
289 	return TC_ACT_UNSPEC;
290 }
291 
292 /* egress@veth_(src|dst)_fwd@ns_fwd priority 100 */
293 SEC("tc")
egress_fwdns_prio100(struct __sk_buff * skb)294 int egress_fwdns_prio100(struct __sk_buff *skb)
295 {
296 	int skb_type;
297 
298 	skb_type = skb_get_type(skb);
299 	if (skb_type == -1)
300 		return TC_ACT_SHOT;
301 	if (!skb_type)
302 		return TC_ACT_OK;
303 
304 	/* delivery_time is always available to egress even
305 	 * the tc-bpf did not use the tstamp_type.
306 	 */
307 	if (skb->tstamp == INGRESS_FWDNS_MAGIC)
308 		inc_dtimes(EGRESS_FWDNS_P100);
309 	else
310 		inc_errs(EGRESS_FWDNS_P100);
311 
312 	if (fwdns_clear_dtime())
313 		skb->tstamp = 0;
314 
315 	return TC_ACT_UNSPEC;
316 }
317 
318 /* ingress@veth_(src|dst)_fwd@ns_fwd priority 101 */
319 SEC("tc")
ingress_fwdns_prio101(struct __sk_buff * skb)320 int ingress_fwdns_prio101(struct __sk_buff *skb)
321 {
322 	int skb_type;
323 
324 	skb_type = skb_get_type(skb);
325 	if (skb_type == -1 || !skb_type)
326 		/* Should have handled in prio100 */
327 		return TC_ACT_SHOT;
328 
329 	if (skb->tstamp_type) {
330 		if (fwdns_clear_dtime() ||
331 		    (skb->tstamp_type != BPF_SKB_CLOCK_MONOTONIC &&
332 		    skb->tstamp_type != BPF_SKB_CLOCK_TAI) ||
333 		    skb->tstamp != EGRESS_ENDHOST_MAGIC)
334 			inc_errs(INGRESS_FWDNS_P101);
335 		else
336 			inc_dtimes(INGRESS_FWDNS_P101);
337 	} else {
338 		if (!fwdns_clear_dtime())
339 			inc_errs(INGRESS_FWDNS_P101);
340 	}
341 
342 	if (skb->tstamp_type == BPF_SKB_CLOCK_MONOTONIC) {
343 		skb->tstamp = INGRESS_FWDNS_MAGIC;
344 	} else {
345 		if (bpf_skb_set_tstamp(skb, INGRESS_FWDNS_MAGIC,
346 				       BPF_SKB_CLOCK_MONOTONIC))
347 			inc_errs(SET_DTIME);
348 	}
349 
350 	if (skb_ns(skb_type) == SRC_NS)
351 		return bpf_fwd() ?
352 			bpf_redirect_neigh(IFINDEX_DST, NULL, 0, 0) : TC_ACT_OK;
353 	else
354 		return bpf_fwd() ?
355 			bpf_redirect_neigh(IFINDEX_SRC, NULL, 0, 0) : TC_ACT_OK;
356 }
357 
358 /* egress@veth_(src|dst)_fwd@ns_fwd priority 101 */
359 SEC("tc")
egress_fwdns_prio101(struct __sk_buff * skb)360 int egress_fwdns_prio101(struct __sk_buff *skb)
361 {
362 	int skb_type;
363 
364 	skb_type = skb_get_type(skb);
365 	if (skb_type == -1 || !skb_type)
366 		/* Should have handled in prio100 */
367 		return TC_ACT_SHOT;
368 
369 	if (skb->tstamp_type) {
370 		if (fwdns_clear_dtime() ||
371 		    skb->tstamp_type != BPF_SKB_CLOCK_MONOTONIC ||
372 		    skb->tstamp != INGRESS_FWDNS_MAGIC)
373 			inc_errs(EGRESS_FWDNS_P101);
374 		else
375 			inc_dtimes(EGRESS_FWDNS_P101);
376 	} else {
377 		if (!fwdns_clear_dtime())
378 			inc_errs(EGRESS_FWDNS_P101);
379 	}
380 
381 	if (skb->tstamp_type == BPF_SKB_CLOCK_MONOTONIC) {
382 		skb->tstamp = EGRESS_FWDNS_MAGIC;
383 	} else {
384 		if (bpf_skb_set_tstamp(skb, EGRESS_FWDNS_MAGIC,
385 				       BPF_SKB_CLOCK_MONOTONIC))
386 			inc_errs(SET_DTIME);
387 	}
388 
389 	return TC_ACT_OK;
390 }
391 
/* License string placed in the "license" ELF section */
char __license[] SEC("license") = "GPL";
393