1 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
2 
3 /*
4  * This test sets up 3 netns (src <-> fwd <-> dst). There is no direct veth link
5  * between src and dst. The netns fwd has veth links to each src and dst. The
6  * client is in src and server in dst. The test installs a TC BPF program to each
7  * host facing veth in fwd which calls into i) bpf_redirect_neigh() to perform the
8  * neigh addr population and redirect or ii) bpf_redirect_peer() for namespace
9  * switch from ingress side; it also installs a checker prog on the egress side
10  * to drop unexpected traffic.
11  */
12 
13 #include <arpa/inet.h>
14 #include <linux/if_tun.h>
15 #include <linux/limits.h>
16 #include <linux/sysctl.h>
17 #include <linux/time_types.h>
18 #include <linux/net_tstamp.h>
19 #include <net/if.h>
20 #include <stdbool.h>
21 #include <stdio.h>
22 #include <sys/stat.h>
23 #include <unistd.h>
24 
25 #include "test_progs.h"
26 #include "network_helpers.h"
27 #include "netlink_helpers.h"
28 #include "test_tc_neigh_fib.skel.h"
29 #include "test_tc_neigh.skel.h"
30 #include "test_tc_peer.skel.h"
31 #include "test_tc_dtime.skel.h"
32 
/* Fallback definition for headers that do not provide TCP_TX_DELAY. */
#ifndef TCP_TX_DELAY
#define TCP_TX_DELAY 37
#endif

/* Names of the three network namespaces the test operates on. */
#define NS_SRC "ns_src"
#define NS_FWD "ns_fwd"
#define NS_DST "ns_dst"

/* IPv4 addresses of the src/dst endpoints and the TCP port used by
 * test_connectivity(). The IP4_TUN_* values are not referenced in this part
 * of the file; presumably they belong to a tunnel test further down - confirm.
 */
#define IP4_SRC "172.16.1.100"
#define IP4_DST "172.16.2.100"
#define IP4_TUN_SRC "172.17.1.100"
#define IP4_TUN_FWD "172.17.1.200"
#define IP4_PORT 9004

/* IPv6 counterparts of the addresses and port above. */
#define IP6_SRC "0::1:dead:beef:cafe"
#define IP6_DST "0::2:dead:beef:cafe"
#define IP6_TUN_SRC "1::1:dead:beef:cafe"
#define IP6_TUN_FWD "1::2:dead:beef:cafe"
#define IP6_PORT 9006

/* IPv4 addresses given to src_fwd (SLL) and dst_fwd (DLL) inside the fwd
 * netns, and the /16 network the end hosts get a route for.
 */
#define IP4_SLL "169.254.0.1"
#define IP4_DLL "169.254.0.2"
#define IP4_NET "169.254.0.0"

/* Fixed MAC addresses assigned to dst_fwd and dst in veth mode. */
#define MAC_DST_FWD "00:11:22:33:44:55"
#define MAC_DST "00:22:33:44:55:66"

/* Number of bytes get_ifaddr() reads from /sys/class/net/<dev>/address. */
#define IFADDR_STR_LEN 18
/* Options passed to the ping command run by test_ping(). */
#define PING_ARGS "-i 0.2 -c 3 -w 10 -q"

/* Timeout handed to connect_to_fd() and settimeo(). */
#define TIMEOUT_MILLIS 10000
#define NSEC_PER_SEC 1000000000ULL
65 
/* Print a printf-style message to stderr, prefixed with the source file,
 * line number and strerror(errno) at the time of the call.
 */
#define log_err(MSG, ...) \
	fprintf(stderr, "(%s:%d: errno: %s) " MSG "\n", \
		__FILE__, __LINE__, strerror(errno), ##__VA_ARGS__)
69 
/* NULL-terminated list of namespace names. netns_objs[] holds the handle
 * created for each name, at the same index; it has one slot per name.
 */
static const char * const namespaces[] = {NS_SRC, NS_FWD, NS_DST, NULL};
static struct netns_obj *netns_objs[3];
72 
/*
 * Write @newval at the start of the existing file @path (set_forwarding()
 * uses this for /proc/sys knobs).
 *
 * stdio buffers the data, so the underlying write may only happen when the
 * stream is flushed by fclose(). Its return value is therefore checked as
 * well; otherwise a value rejected by the kernel would be reported as success.
 *
 * Returns 0 on success, -1 if the file cannot be opened, written or closed.
 */
static int write_file(const char *path, const char *newval)
{
	int ret = 0;
	FILE *f;

	f = fopen(path, "r+");
	if (!f)
		return -1;

	if (fwrite(newval, strlen(newval), 1, f) != 1) {
		log_err("writing to %s failed", path);
		ret = -1;
	}

	/* Always close the stream; only report the close error if the write
	 * itself has not already failed.
	 */
	if (fclose(f) != 0 && ret == 0) {
		log_err("closing %s failed", path);
		ret = -1;
	}

	return ret;
}
88 
netns_setup_namespaces(const char * verb)89 static int netns_setup_namespaces(const char *verb)
90 {
91 	struct netns_obj **ns_obj = netns_objs;
92 	const char * const *ns = namespaces;
93 
94 	while (*ns) {
95 		if (strcmp(verb, "add") == 0) {
96 			*ns_obj = netns_new(*ns, false);
97 			if (!ASSERT_OK_PTR(*ns_obj, "netns_new"))
98 				return -1;
99 		} else {
100 			if (!ASSERT_OK_PTR(*ns_obj, "netns_obj is NULL"))
101 				return -1;
102 			netns_free(*ns_obj);
103 			*ns_obj = NULL;
104 		}
105 		ns++;
106 		ns_obj++;
107 	}
108 	return 0;
109 }
110 
netns_setup_namespaces_nofail(const char * verb)111 static void netns_setup_namespaces_nofail(const char *verb)
112 {
113 	struct netns_obj **ns_obj = netns_objs;
114 	const char * const *ns = namespaces;
115 
116 	while (*ns) {
117 		if (strcmp(verb, "add") == 0) {
118 			*ns_obj = netns_new(*ns, false);
119 		} else {
120 			if (*ns_obj)
121 				netns_free(*ns_obj);
122 			*ns_obj = NULL;
123 		}
124 		ns++;
125 		ns_obj++;
126 	}
127 }
128 
/* Kind of device pair netns_setup_links_and_routes() creates between the
 * namespaces: veth pairs via "ip link", or netkit pairs via create_netkit().
 */
enum dev_mode {
	MODE_VETH,
	MODE_NETKIT,
};
133 
/* Input/output of netns_setup_links_and_routes(): the caller selects
 * dev_mode, and the function fills in the interface indexes of the four
 * devices ("src", "src_fwd", "dst", "dst_fwd") it created.
 */
struct netns_setup_result {
	enum dev_mode dev_mode;
	int ifindex_src;
	int ifindex_src_fwd;
	int ifindex_dst;
	int ifindex_dst_fwd;
};
141 
get_ifaddr(const char * name,char * ifaddr)142 static int get_ifaddr(const char *name, char *ifaddr)
143 {
144 	char path[PATH_MAX];
145 	FILE *f;
146 	int ret;
147 
148 	snprintf(path, PATH_MAX, "/sys/class/net/%s/address", name);
149 	f = fopen(path, "r");
150 	if (!ASSERT_OK_PTR(f, path))
151 		return -1;
152 
153 	ret = fread(ifaddr, 1, IFADDR_STR_LEN, f);
154 	if (!ASSERT_EQ(ret, IFADDR_STR_LEN, "fread ifaddr")) {
155 		fclose(f);
156 		return -1;
157 	}
158 	fclose(f);
159 	return 0;
160 }
161 
/*
 * Create a netkit device pair through rtnetlink. @prim is the name of the
 * primary device, @peer the name of its peer, and @mode is sent as the
 * IFLA_NETKIT_MODE attribute.
 *
 * Returns 0 on success, otherwise the non-zero result of rtnl_open() or
 * rtnl_talk().
 */
static int create_netkit(int mode, char *prim, char *peer)
{
	struct rtattr *linkinfo, *data, *peer_info;
	struct rtnl_handle rth = { .fd = -1 };
	const char *type = "netkit";
	struct {
		struct nlmsghdr n;
		struct ifinfomsg i;
		char buf[1024];
	} req = {};
	int err;

	err = rtnl_open(&rth, 0);
	if (!ASSERT_OK(err, "open_rtnetlink"))
		return err;

	/* RTM_NEWLINK request that must create a new link (CREATE | EXCL). */
	memset(&req, 0, sizeof(req));
	req.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
	req.n.nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
	req.n.nlmsg_type = RTM_NEWLINK;
	req.i.ifi_family = AF_UNSPEC;

	/* Attribute layout built below (names are added without their
	 * terminating NUL, since strlen() is used as the length):
	 *
	 *   IFLA_IFNAME = prim
	 *   IFLA_LINKINFO
	 *     IFLA_INFO_KIND = "netkit"
	 *     IFLA_INFO_DATA
	 *       IFLA_NETKIT_MODE = mode
	 *       IFLA_NETKIT_PEER_INFO
	 *         <sizeof(struct ifinfomsg) zero bytes>
	 *         IFLA_IFNAME = peer
	 */
	addattr_l(&req.n, sizeof(req), IFLA_IFNAME, prim, strlen(prim));
	linkinfo = addattr_nest(&req.n, sizeof(req), IFLA_LINKINFO);
	addattr_l(&req.n, sizeof(req), IFLA_INFO_KIND, type, strlen(type));
	data = addattr_nest(&req.n, sizeof(req), IFLA_INFO_DATA);
	addattr32(&req.n, sizeof(req), IFLA_NETKIT_MODE, mode);
	peer_info = addattr_nest(&req.n, sizeof(req), IFLA_NETKIT_PEER_INFO);
	/* Leave room for a struct ifinfomsg inside the peer nest; the bytes
	 * stay zero from the memset above.
	 * NOTE(review): presumably the kernel parses the peer info as an
	 * ifinfomsg header followed by attributes - confirm.
	 */
	req.n.nlmsg_len += sizeof(struct ifinfomsg);
	addattr_l(&req.n, sizeof(req), IFLA_IFNAME, peer, strlen(peer));
	/* Close the nests innermost first. */
	addattr_nest_end(&req.n, peer_info);
	addattr_nest_end(&req.n, data);
	addattr_nest_end(&req.n, linkinfo);

	err = rtnl_talk(&rth, &req.n, NULL);
	ASSERT_OK(err, "talk_rtnetlink");
	rtnl_close(&rth);
	return err;
}
201 
/*
 * Create the two device pairs (src <-> src_fwd, dst <-> dst_fwd) in the
 * current namespace, record their ifindexes in @result, move each device
 * into its namespace and configure addresses, routes and (veth mode only)
 * static neighbour entries there.
 *
 * @result->dev_mode selects veth or netkit devices and must be set by the
 * caller; the ifindex_* members are filled in here.
 *
 * Returns 0 on success, -1 on the first failing step. The namespace that was
 * being configured at that point is closed before returning.
 */
static int netns_setup_links_and_routes(struct netns_setup_result *result)
{
	struct nstoken *nstoken = NULL;
	/* Zero-initialised with one spare byte so the bytes read by
	 * get_ifaddr() form a NUL-terminated string.
	 */
	char src_fwd_addr[IFADDR_STR_LEN+1] = {};
	char src_addr[IFADDR_STR_LEN + 1] = {};
	int err;

	if (result->dev_mode == MODE_VETH) {
		SYS(fail, "ip link add src type veth peer name src_fwd");
		SYS(fail, "ip link add dst type veth peer name dst_fwd");

		/* Give the dst side fixed MACs so the neighbour entries
		 * below can use compile-time constants.
		 */
		SYS(fail, "ip link set dst_fwd address " MAC_DST_FWD);
		SYS(fail, "ip link set dst address " MAC_DST);
	} else if (result->dev_mode == MODE_NETKIT) {
		err = create_netkit(NETKIT_L3, "src", "src_fwd");
		if (!ASSERT_OK(err, "create_ifindex_src"))
			goto fail;
		err = create_netkit(NETKIT_L3, "dst", "dst_fwd");
		if (!ASSERT_OK(err, "create_ifindex_dst"))
			goto fail;
	}

	/* The src side keeps whatever MACs it was created with; read them
	 * back while the devices are still in the current namespace.
	 */
	if (get_ifaddr("src_fwd", src_fwd_addr))
		goto fail;

	if (get_ifaddr("src", src_addr))
		goto fail;

	/* Look up the ifindexes before the devices are moved away. */
	result->ifindex_src = if_nametoindex("src");
	if (!ASSERT_GT(result->ifindex_src, 0, "ifindex_src"))
		goto fail;

	result->ifindex_src_fwd = if_nametoindex("src_fwd");
	if (!ASSERT_GT(result->ifindex_src_fwd, 0, "ifindex_src_fwd"))
		goto fail;

	result->ifindex_dst = if_nametoindex("dst");
	if (!ASSERT_GT(result->ifindex_dst, 0, "ifindex_dst"))
		goto fail;

	result->ifindex_dst_fwd = if_nametoindex("dst_fwd");
	if (!ASSERT_GT(result->ifindex_dst_fwd, 0, "ifindex_dst_fwd"))
		goto fail;

	/* Move each device into the namespace it belongs to. */
	SYS(fail, "ip link set src netns " NS_SRC);
	SYS(fail, "ip link set src_fwd netns " NS_FWD);
	SYS(fail, "ip link set dst_fwd netns " NS_FWD);
	SYS(fail, "ip link set dst netns " NS_DST);

	/* Setup in the 'src' namespace: address, routes towards dst. */
	nstoken = open_netns(NS_SRC);
	if (!ASSERT_OK_PTR(nstoken, "setns src"))
		goto fail;

	SYS(fail, "ip addr add " IP4_SRC "/32 dev src");
	SYS(fail, "ip addr add " IP6_SRC "/128 dev src nodad");
	SYS(fail, "ip link set dev src up");

	SYS(fail, "ip route add " IP4_DST "/32 dev src scope global");
	SYS(fail, "ip route add " IP4_NET "/16 dev src scope global");
	SYS(fail, "ip route add " IP6_DST "/128 dev src scope global");

	/* Static neighbours are only installed for veth devices. */
	if (result->dev_mode == MODE_VETH) {
		SYS(fail, "ip neigh add " IP4_DST " dev src lladdr %s",
		    src_fwd_addr);
		SYS(fail, "ip neigh add " IP6_DST " dev src lladdr %s",
		    src_fwd_addr);
	}

	close_netns(nstoken);

	/* Setup in the 'fwd' namespace: both forwarding-side devices. */
	nstoken = open_netns(NS_FWD);
	if (!ASSERT_OK_PTR(nstoken, "setns fwd"))
		goto fail;

	/* The fwd netns automatically gets a v6 LL address / routes, but also
	 * needs v4 one in order to start ARP probing. IP4_NET route is added
	 * to the endpoints so that the ARP processing will reply.
	 */
	SYS(fail, "ip addr add " IP4_SLL "/32 dev src_fwd");
	SYS(fail, "ip addr add " IP4_DLL "/32 dev dst_fwd");
	SYS(fail, "ip link set dev src_fwd up");
	SYS(fail, "ip link set dev dst_fwd up");

	SYS(fail, "ip route add " IP4_SRC "/32 dev src_fwd scope global");
	SYS(fail, "ip route add " IP6_SRC "/128 dev src_fwd scope global");
	SYS(fail, "ip route add " IP4_DST "/32 dev dst_fwd scope global");
	SYS(fail, "ip route add " IP6_DST "/128 dev dst_fwd scope global");

	if (result->dev_mode == MODE_VETH) {
		SYS(fail, "ip neigh add " IP4_SRC " dev src_fwd lladdr %s", src_addr);
		SYS(fail, "ip neigh add " IP6_SRC " dev src_fwd lladdr %s", src_addr);
		SYS(fail, "ip neigh add " IP4_DST " dev dst_fwd lladdr %s", MAC_DST);
		SYS(fail, "ip neigh add " IP6_DST " dev dst_fwd lladdr %s", MAC_DST);
	}

	close_netns(nstoken);

	/* Setup in the 'dst' namespace: address, loopback, routes to src. */
	nstoken = open_netns(NS_DST);
	if (!ASSERT_OK_PTR(nstoken, "setns dst"))
		goto fail;

	SYS(fail, "ip addr add " IP4_DST "/32 dev dst");
	SYS(fail, "ip addr add " IP6_DST "/128 dev dst nodad");
	SYS(fail, "ip link set dev dst up");
	SYS(fail, "ip link set dev lo up");

	SYS(fail, "ip route add " IP4_SRC "/32 dev dst scope global");
	SYS(fail, "ip route add " IP4_NET "/16 dev dst scope global");
	SYS(fail, "ip route add " IP6_SRC "/128 dev dst scope global");

	if (result->dev_mode == MODE_VETH) {
		SYS(fail, "ip neigh add " IP4_SRC " dev dst lladdr " MAC_DST_FWD);
		SYS(fail, "ip neigh add " IP6_SRC " dev dst lladdr " MAC_DST_FWD);
	}

	close_netns(nstoken);

	return 0;
fail:
	/* nstoken is NULL here unless a namespace is currently open: every
	 * jump to this label happens either before the first open_netns(),
	 * right after a failed open_netns(), or while a token is held.
	 */
	if (nstoken)
		close_netns(nstoken);
	return -1;
}
328 
qdisc_clsact_create(struct bpf_tc_hook * qdisc_hook,int ifindex)329 static int qdisc_clsact_create(struct bpf_tc_hook *qdisc_hook, int ifindex)
330 {
331 	char err_str[128], ifname[16];
332 	int err;
333 
334 	qdisc_hook->ifindex = ifindex;
335 	qdisc_hook->attach_point = BPF_TC_INGRESS | BPF_TC_EGRESS;
336 	err = bpf_tc_hook_create(qdisc_hook);
337 	snprintf(err_str, sizeof(err_str),
338 		 "qdisc add dev %s clsact",
339 		 if_indextoname(qdisc_hook->ifindex, ifname) ? : "<unknown_iface>");
340 	err_str[sizeof(err_str) - 1] = 0;
341 	ASSERT_OK(err, err_str);
342 
343 	return err;
344 }
345 
xgress_filter_add(struct bpf_tc_hook * qdisc_hook,enum bpf_tc_attach_point xgress,const struct bpf_program * prog,int priority)346 static int xgress_filter_add(struct bpf_tc_hook *qdisc_hook,
347 			     enum bpf_tc_attach_point xgress,
348 			     const struct bpf_program *prog, int priority)
349 {
350 	LIBBPF_OPTS(bpf_tc_opts, tc_attach);
351 	char err_str[128], ifname[16];
352 	int err;
353 
354 	qdisc_hook->attach_point = xgress;
355 	tc_attach.prog_fd = bpf_program__fd(prog);
356 	tc_attach.priority = priority;
357 	err = bpf_tc_attach(qdisc_hook, &tc_attach);
358 	snprintf(err_str, sizeof(err_str),
359 		 "filter add dev %s %s prio %d bpf da %s",
360 		 if_indextoname(qdisc_hook->ifindex, ifname) ? : "<unknown_iface>",
361 		 xgress == BPF_TC_INGRESS ? "ingress" : "egress",
362 		 priority, bpf_program__name(prog));
363 	err_str[sizeof(err_str) - 1] = 0;
364 	ASSERT_OK(err, err_str);
365 
366 	return err;
367 }
368 
/* Convenience wrappers around qdisc_clsact_create() / xgress_filter_add().
 * Both store the result in a local variable named 'err' and jump to a label
 * named 'fail' when it is non-zero, so the calling function must provide
 * both.
 */
#define QDISC_CLSACT_CREATE(qdisc_hook, ifindex) ({		\
	if ((err = qdisc_clsact_create(qdisc_hook, ifindex)))	\
		goto fail;					\
})

#define XGRESS_FILTER_ADD(qdisc_hook, xgress, prog, priority) ({		\
	if ((err = xgress_filter_add(qdisc_hook, xgress, prog, priority)))	\
		goto fail;							\
})
378 
netns_load_bpf(const struct bpf_program * src_prog,const struct bpf_program * dst_prog,const struct bpf_program * chk_prog,const struct netns_setup_result * setup_result)379 static int netns_load_bpf(const struct bpf_program *src_prog,
380 			  const struct bpf_program *dst_prog,
381 			  const struct bpf_program *chk_prog,
382 			  const struct netns_setup_result *setup_result)
383 {
384 	LIBBPF_OPTS(bpf_tc_hook, qdisc_src_fwd);
385 	LIBBPF_OPTS(bpf_tc_hook, qdisc_dst_fwd);
386 	int err;
387 
388 	/* tc qdisc add dev src_fwd clsact */
389 	QDISC_CLSACT_CREATE(&qdisc_src_fwd, setup_result->ifindex_src_fwd);
390 	/* tc filter add dev src_fwd ingress bpf da src_prog */
391 	XGRESS_FILTER_ADD(&qdisc_src_fwd, BPF_TC_INGRESS, src_prog, 0);
392 	/* tc filter add dev src_fwd egress bpf da chk_prog */
393 	XGRESS_FILTER_ADD(&qdisc_src_fwd, BPF_TC_EGRESS, chk_prog, 0);
394 
395 	/* tc qdisc add dev dst_fwd clsact */
396 	QDISC_CLSACT_CREATE(&qdisc_dst_fwd, setup_result->ifindex_dst_fwd);
397 	/* tc filter add dev dst_fwd ingress bpf da dst_prog */
398 	XGRESS_FILTER_ADD(&qdisc_dst_fwd, BPF_TC_INGRESS, dst_prog, 0);
399 	/* tc filter add dev dst_fwd egress bpf da chk_prog */
400 	XGRESS_FILTER_ADD(&qdisc_dst_fwd, BPF_TC_EGRESS, chk_prog, 0);
401 
402 	return 0;
403 fail:
404 	return -1;
405 }
406 
/*
 * TCP round trip between the end hosts: start a server on @addr:@port in
 * the dst namespace, connect to it from the src namespace, send one message
 * and check that the server side receives all of it.
 */
static void test_tcp(int family, const char *addr, __u16 port)
{
	int srv_fd = -1, conn_fd = -1, cli_fd = -1;
	char payload[] = "testing testing";
	struct nstoken *tok;
	int nbytes;

	tok = open_netns(NS_DST);
	if (!ASSERT_OK_PTR(tok, "setns dst"))
		return;

	srv_fd = start_server(family, SOCK_STREAM, addr, port, 0);
	if (!ASSERT_GE(srv_fd, 0, "listen"))
		goto out;

	/* Switch to the client's namespace for the connect. */
	close_netns(tok);
	tok = open_netns(NS_SRC);
	if (!ASSERT_OK_PTR(tok, "setns src"))
		goto out;

	cli_fd = connect_to_fd(srv_fd, TIMEOUT_MILLIS);
	if (!ASSERT_GE(cli_fd, 0, "connect_to_fd"))
		goto out;

	conn_fd = accept(srv_fd, NULL, NULL);
	if (!ASSERT_GE(conn_fd, 0, "accept"))
		goto out;

	if (!ASSERT_OK(settimeo(conn_fd, TIMEOUT_MILLIS), "settimeo"))
		goto out;

	nbytes = write(cli_fd, payload, sizeof(payload));
	if (!ASSERT_EQ(nbytes, sizeof(payload), "send to server"))
		goto out;

	nbytes = read(conn_fd, payload, sizeof(payload));
	ASSERT_EQ(nbytes, sizeof(payload), "recv from server");

out:
	/* tok is NULL only when re-entering the src namespace failed. */
	if (tok)
		close_netns(tok);
	if (srv_fd >= 0)
		close(srv_fd);
	if (conn_fd >= 0)
		close(conn_fd);
	if (cli_fd >= 0)
		close(cli_fd);
}
455 
test_ping(int family,const char * addr)456 static int test_ping(int family, const char *addr)
457 {
458 	SYS(fail, "ip netns exec " NS_SRC " %s " PING_ARGS " %s > /dev/null", ping_command(family), addr);
459 	return 0;
460 fail:
461 	return -1;
462 }
463 
test_connectivity(void)464 static void test_connectivity(void)
465 {
466 	test_tcp(AF_INET, IP4_DST, IP4_PORT);
467 	test_ping(AF_INET, IP4_DST);
468 	test_tcp(AF_INET6, IP6_DST, IP6_PORT);
469 	test_ping(AF_INET6, IP6_DST);
470 }
471 
/*
 * Enable or disable IPv4 and IPv6 forwarding in the namespace the caller is
 * currently in, by writing "1" or "0" to the corresponding /proc/sys files.
 *
 * The assertion labels reflect the value actually being written, so a
 * failure to enable forwarding is not reported as "...=0".
 *
 * Returns 0 on success, or the non-zero result of the failing write_file().
 */
static int set_forwarding(bool enable)
{
	const char *val = enable ? "1" : "0";
	int err;

	err = write_file("/proc/sys/net/ipv4/ip_forward", val);
	if (!ASSERT_OK(err, enable ? "set ipv4.ip_forward=1" :
				     "set ipv4.ip_forward=0"))
		return err;

	err = write_file("/proc/sys/net/ipv6/conf/all/forwarding", val);
	if (!ASSERT_OK(err, enable ? "set ipv6.forwarding=1" :
				     "set ipv6.forwarding=0"))
		return err;

	return 0;
}
486 
/*
 * Receive one message on @fd, check that it is @s bytes long and matches
 * @expected, and extract the SO_TIMESTAMPNS receive timestamp from the
 * control data (0 if no such control message is present).
 *
 * If @tstamp is non-NULL the timestamp, in nanoseconds, is stored there and
 * left for the caller to judge. Otherwise it is asserted to be non-zero, not
 * in the future, and less than 5 seconds older than CLOCK_REALTIME.
 *
 * Returns 0 once a message of the right length was received, -1 otherwise.
 */
static int __rcv_tstamp(int fd, const char *expected, size_t s, __u64 *tstamp)
{
	struct timespec rx_ts = {}, cur_ts;
	char cbuf[CMSG_SPACE(sizeof(rx_ts))];
	char payload[32];
	struct iovec vec = {
		.iov_base = payload,
		.iov_len = sizeof(payload),
	};
	struct msghdr mh = {
		.msg_iov = &vec,
		.msg_iovlen = 1,
		.msg_control = &cbuf,
		.msg_controllen = sizeof(cbuf),
	};
	struct cmsghdr *hdr;
	__u64 cur_ns, rx_ns;
	int nread, err;

	nread = recvmsg(fd, &mh, 0);
	if (!ASSERT_EQ(nread, s, "recvmsg"))
		return -1;
	ASSERT_STRNEQ(payload, expected, s, "expected rcv data");

	/* Only the first control message is inspected. */
	hdr = CMSG_FIRSTHDR(&mh);
	if (hdr && hdr->cmsg_level == SOL_SOCKET &&
	    hdr->cmsg_type == SO_TIMESTAMPNS)
		memcpy(&rx_ts, CMSG_DATA(hdr), sizeof(rx_ts));

	rx_ns = rx_ts.tv_sec * NSEC_PER_SEC + rx_ts.tv_nsec;
	if (tstamp) {
		/* caller will check the tstamp itself */
		*tstamp = rx_ns;
		return 0;
	}

	ASSERT_NEQ(rx_ns, 0, "pkt rcv tstamp");

	err = clock_gettime(CLOCK_REALTIME, &cur_ts);
	ASSERT_OK(err, "clock_gettime");
	cur_ns = cur_ts.tv_sec * NSEC_PER_SEC + cur_ts.tv_nsec;

	if (ASSERT_GE(cur_ns, rx_ns, "check rcv tstamp"))
		ASSERT_LT(cur_ns - rx_ns, 5 * NSEC_PER_SEC,
			  "check rcv tstamp");
	return 0;
}
534 
/*
 * Receive one message on @fd and let __rcv_tstamp() itself validate the
 * receive timestamp (NULL is passed for its tstamp argument). The return
 * value is deliberately dropped; problems surface through the assertions.
 */
static void rcv_tstamp(int fd, const char *expected, size_t s)
{
	(void)__rcv_tstamp(fd, expected, s, NULL);
}
539 
wait_netstamp_needed_key(void)540 static int wait_netstamp_needed_key(void)
541 {
542 	int opt = 1, srv_fd = -1, cli_fd = -1, nretries = 0, err, n;
543 	char buf[] = "testing testing";
544 	struct nstoken *nstoken;
545 	__u64 tstamp = 0;
546 
547 	nstoken = open_netns(NS_DST);
548 	if (!ASSERT_OK_PTR(nstoken, "setns dst"))
549 		return -1;
550 
551 	srv_fd = start_server(AF_INET6, SOCK_DGRAM, "::1", 0, 0);
552 	if (!ASSERT_GE(srv_fd, 0, "start_server"))
553 		goto done;
554 
555 	err = setsockopt(srv_fd, SOL_SOCKET, SO_TIMESTAMPNS,
556 			 &opt, sizeof(opt));
557 	if (!ASSERT_OK(err, "setsockopt(SO_TIMESTAMPNS)"))
558 		goto done;
559 
560 	cli_fd = connect_to_fd(srv_fd, TIMEOUT_MILLIS);
561 	if (!ASSERT_GE(cli_fd, 0, "connect_to_fd"))
562 		goto done;
563 
564 again:
565 	n = write(cli_fd, buf, sizeof(buf));
566 	if (!ASSERT_EQ(n, sizeof(buf), "send to server"))
567 		goto done;
568 	err = __rcv_tstamp(srv_fd, buf, sizeof(buf), &tstamp);
569 	if (!ASSERT_OK(err, "__rcv_tstamp"))
570 		goto done;
571 	if (!tstamp && nretries++ < 5) {
572 		sleep(1);
573 		printf("netstamp_needed_key retry#%d\n", nretries);
574 		goto again;
575 	}
576 
577 done:
578 	if (!tstamp && srv_fd != -1) {
579 		close(srv_fd);
580 		srv_fd = -1;
581 	}
582 	if (cli_fd != -1)
583 		close(cli_fd);
584 	close_netns(nstoken);
585 	return srv_fd;
586 }
587 
snd_tstamp(int fd,char * b,size_t s)588 static void snd_tstamp(int fd, char *b, size_t s)
589 {
590 	struct sock_txtime opt = { .clockid = CLOCK_TAI };
591 	char ctl[CMSG_SPACE(sizeof(__u64))];
592 	struct timespec now_ts;
593 	struct msghdr msg = {};
594 	struct cmsghdr *cmsg;
595 	struct iovec iov;
596 	__u64 now_ns;
597 	int ret;
598 
599 	ret = clock_gettime(CLOCK_TAI, &now_ts);
600 	ASSERT_OK(ret, "clock_get_time(CLOCK_TAI)");
601 	now_ns = now_ts.tv_sec * NSEC_PER_SEC + now_ts.tv_nsec;
602 
603 	iov.iov_base = b;
604 	iov.iov_len = s;
605 	msg.msg_iov = &iov;
606 	msg.msg_iovlen = 1;
607 	msg.msg_control = &ctl;
608 	msg.msg_controllen = sizeof(ctl);
609 
610 	cmsg = CMSG_FIRSTHDR(&msg);
611 	cmsg->cmsg_level = SOL_SOCKET;
612 	cmsg->cmsg_type = SCM_TXTIME;
613 	cmsg->cmsg_len = CMSG_LEN(sizeof(now_ns));
614 	*(__u64 *)CMSG_DATA(cmsg) = now_ns;
615 
616 	ret = setsockopt(fd, SOL_SOCKET, SO_TXTIME, &opt, sizeof(opt));
617 	ASSERT_OK(ret, "setsockopt(SO_TXTIME)");
618 
619 	ret = sendmsg(fd, &msg, 0);
620 	ASSERT_EQ(ret, s, "sendmsg");
621 }
622 
/*
 * Exchange one message from the src namespace to a server in the dst
 * namespace and validate its receive timestamp.
 *
 * For SOCK_STREAM the message is written on the connected socket and read
 * from the accepted one; for any other @type it is sent with snd_tstamp()
 * and read directly from the server socket.
 */
static void test_inet_dtime(int family, int type, const char *addr, __u16 port)
{
	int on = 1, conn_fd = -1, cli_fd = -1, srv_fd, err, sent;
	char payload[] = "testing testing";
	struct nstoken *tok;

	tok = open_netns(NS_DST);
	if (!ASSERT_OK_PTR(tok, "setns dst"))
		return;
	srv_fd = start_server(family, type, addr, port, 0);
	close_netns(tok);

	if (!ASSERT_GE(srv_fd, 0, "listen"))
		return;

	/* Ensure the kernel puts the (rcv) timestamp for all skb */
	err = setsockopt(srv_fd, SOL_SOCKET, SO_TIMESTAMPNS, &on, sizeof(on));
	if (!ASSERT_OK(err, "setsockopt(SO_TIMESTAMPNS)"))
		goto out;

	if (type == SOCK_STREAM) {
		/* Ensure the kernel set EDT when sending out rst/ack
		 * from the kernel's ctl_sk.
		 */
		err = setsockopt(srv_fd, SOL_TCP, TCP_TX_DELAY, &on,
				 sizeof(on));
		if (!ASSERT_OK(err, "setsockopt(TCP_TX_DELAY)"))
			goto out;
	}

	/* Only the connect itself runs inside the src namespace. */
	tok = open_netns(NS_SRC);
	if (!ASSERT_OK_PTR(tok, "setns src"))
		goto out;
	cli_fd = connect_to_fd(srv_fd, TIMEOUT_MILLIS);
	close_netns(tok);

	if (!ASSERT_GE(cli_fd, 0, "connect_to_fd"))
		goto out;

	if (type != SOCK_STREAM) {
		snd_tstamp(cli_fd, payload, sizeof(payload));
		rcv_tstamp(srv_fd, payload, sizeof(payload));
		goto out;
	}

	conn_fd = accept(srv_fd, NULL, NULL);
	if (!ASSERT_GE(conn_fd, 0, "accept"))
		goto out;

	sent = write(cli_fd, payload, sizeof(payload));
	if (!ASSERT_EQ(sent, sizeof(payload), "send to server"))
		goto out;
	rcv_tstamp(conn_fd, payload, sizeof(payload));

out:
	close(srv_fd);
	if (conn_fd != -1)
		close(conn_fd);
	if (cli_fd != -1)
		close(cli_fd);
}
686 
netns_load_dtime_bpf(struct test_tc_dtime * skel,const struct netns_setup_result * setup_result)687 static int netns_load_dtime_bpf(struct test_tc_dtime *skel,
688 				const struct netns_setup_result *setup_result)
689 {
690 	LIBBPF_OPTS(bpf_tc_hook, qdisc_src_fwd);
691 	LIBBPF_OPTS(bpf_tc_hook, qdisc_dst_fwd);
692 	LIBBPF_OPTS(bpf_tc_hook, qdisc_src);
693 	LIBBPF_OPTS(bpf_tc_hook, qdisc_dst);
694 	struct nstoken *nstoken;
695 	int err;
696 
697 	/* setup ns_src tc progs */
698 	nstoken = open_netns(NS_SRC);
699 	if (!ASSERT_OK_PTR(nstoken, "setns " NS_SRC))
700 		return -1;
701 	/* tc qdisc add dev src clsact */
702 	QDISC_CLSACT_CREATE(&qdisc_src, setup_result->ifindex_src);
703 	/* tc filter add dev src ingress bpf da ingress_host */
704 	XGRESS_FILTER_ADD(&qdisc_src, BPF_TC_INGRESS, skel->progs.ingress_host, 0);
705 	/* tc filter add dev src egress bpf da egress_host */
706 	XGRESS_FILTER_ADD(&qdisc_src, BPF_TC_EGRESS, skel->progs.egress_host, 0);
707 	close_netns(nstoken);
708 
709 	/* setup ns_dst tc progs */
710 	nstoken = open_netns(NS_DST);
711 	if (!ASSERT_OK_PTR(nstoken, "setns " NS_DST))
712 		return -1;
713 	/* tc qdisc add dev dst clsact */
714 	QDISC_CLSACT_CREATE(&qdisc_dst, setup_result->ifindex_dst);
715 	/* tc filter add dev dst ingress bpf da ingress_host */
716 	XGRESS_FILTER_ADD(&qdisc_dst, BPF_TC_INGRESS, skel->progs.ingress_host, 0);
717 	/* tc filter add dev dst egress bpf da egress_host */
718 	XGRESS_FILTER_ADD(&qdisc_dst, BPF_TC_EGRESS, skel->progs.egress_host, 0);
719 	close_netns(nstoken);
720 
721 	/* setup ns_fwd tc progs */
722 	nstoken = open_netns(NS_FWD);
723 	if (!ASSERT_OK_PTR(nstoken, "setns " NS_FWD))
724 		return -1;
725 	/* tc qdisc add dev dst_fwd clsact */
726 	QDISC_CLSACT_CREATE(&qdisc_dst_fwd, setup_result->ifindex_dst_fwd);
727 	/* tc filter add dev dst_fwd ingress prio 100 bpf da ingress_fwdns_prio100 */
728 	XGRESS_FILTER_ADD(&qdisc_dst_fwd, BPF_TC_INGRESS,
729 			  skel->progs.ingress_fwdns_prio100, 100);
730 	/* tc filter add dev dst_fwd ingress prio 101 bpf da ingress_fwdns_prio101 */
731 	XGRESS_FILTER_ADD(&qdisc_dst_fwd, BPF_TC_INGRESS,
732 			  skel->progs.ingress_fwdns_prio101, 101);
733 	/* tc filter add dev dst_fwd egress prio 100 bpf da egress_fwdns_prio100 */
734 	XGRESS_FILTER_ADD(&qdisc_dst_fwd, BPF_TC_EGRESS,
735 			  skel->progs.egress_fwdns_prio100, 100);
736 	/* tc filter add dev dst_fwd egress prio 101 bpf da egress_fwdns_prio101 */
737 	XGRESS_FILTER_ADD(&qdisc_dst_fwd, BPF_TC_EGRESS,
738 			  skel->progs.egress_fwdns_prio101, 101);
739 
740 	/* tc qdisc add dev src_fwd clsact */
741 	QDISC_CLSACT_CREATE(&qdisc_src_fwd, setup_result->ifindex_src_fwd);
742 	/* tc filter add dev src_fwd ingress prio 100 bpf da ingress_fwdns_prio100 */
743 	XGRESS_FILTER_ADD(&qdisc_src_fwd, BPF_TC_INGRESS,
744 			  skel->progs.ingress_fwdns_prio100, 100);
745 	/* tc filter add dev src_fwd ingress prio 101 bpf da ingress_fwdns_prio101 */
746 	XGRESS_FILTER_ADD(&qdisc_src_fwd, BPF_TC_INGRESS,
747 			  skel->progs.ingress_fwdns_prio101, 101);
748 	/* tc filter add dev src_fwd egress prio 100 bpf da egress_fwdns_prio100 */
749 	XGRESS_FILTER_ADD(&qdisc_src_fwd, BPF_TC_EGRESS,
750 			  skel->progs.egress_fwdns_prio100, 100);
751 	/* tc filter add dev src_fwd egress prio 101 bpf da egress_fwdns_prio101 */
752 	XGRESS_FILTER_ADD(&qdisc_src_fwd, BPF_TC_EGRESS,
753 			  skel->progs.egress_fwdns_prio101, 101);
754 	close_netns(nstoken);
755 	return 0;
756 
757 fail:
758 	close_netns(nstoken);
759 	return err;
760 }
761 
/* Index of each per-test counter in the skeleton's dtimes[test][] and
 * errs[test][] arrays. The dtimes checks in this file only cover indexes
 * below SET_DTIME, while the errs checks run up to __MAX_CNT.
 */
enum {
	INGRESS_FWDNS_P100,
	INGRESS_FWDNS_P101,
	EGRESS_FWDNS_P100,
	EGRESS_FWDNS_P101,
	INGRESS_ENDHOST,
	EGRESS_ENDHOST,
	SET_DTIME,
	__MAX_CNT,
};
772 
/* Printable name for each counter index above, in enum order; there is no
 * entry for __MAX_CNT. Used to build assertion labels.
 */
const char *cnt_names[] = {
	"ingress_fwdns_p100",
	"ingress_fwdns_p101",
	"egress_fwdns_p100",
	"egress_fwdns_p101",
	"ingress_endhost",
	"egress_endhost",
	"set_dtime",
};
782 
/* Identifier of each dtime sub-test. The value is written to the skeleton's
 * 'test' variable, selects the row in dtimes[]/errs[], and is added to 50000
 * to form the server port. The *_RT_FWD variants are the ones run with
 * kernel forwarding enabled.
 */
enum {
	TCP_IP6_CLEAR_DTIME,
	TCP_IP4,
	TCP_IP6,
	UDP_IP4,
	UDP_IP6,
	TCP_IP4_RT_FWD,
	TCP_IP6_RT_FWD,
	UDP_IP4_RT_FWD,
	UDP_IP6_RT_FWD,
	UKN_TEST,
	__NR_TESTS,
};
796 
/* Printable name for each sub-test identifier above, in enum order. Note
 * that there is no entry for UKN_TEST (or __NR_TESTS), so indexing with
 * those values reads past the end of the array.
 */
const char *test_names[] = {
	"tcp ip6 clear dtime",
	"tcp ip4",
	"tcp ip6",
	"udp ip4",
	"udp ip6",
	"tcp ip4 rt fwd",
	"tcp ip6 rt fwd",
	"udp ip4 rt fwd",
	"udp ip6 rt fwd",
};
808 
dtime_cnt_str(int test,int cnt)809 static const char *dtime_cnt_str(int test, int cnt)
810 {
811 	static char name[64];
812 
813 	snprintf(name, sizeof(name), "%s %s", test_names[test], cnt_names[cnt]);
814 
815 	return name;
816 }
817 
dtime_err_str(int test,int cnt)818 static const char *dtime_err_str(int test, int cnt)
819 {
820 	static char name[64];
821 
822 	snprintf(name, sizeof(name), "%s %s errs", test_names[test],
823 		 cnt_names[cnt]);
824 
825 	return name;
826 }
827 
test_tcp_clear_dtime(struct test_tc_dtime * skel)828 static void test_tcp_clear_dtime(struct test_tc_dtime *skel)
829 {
830 	int i, t = TCP_IP6_CLEAR_DTIME;
831 	__u32 *dtimes = skel->bss->dtimes[t];
832 	__u32 *errs = skel->bss->errs[t];
833 
834 	skel->bss->test = t;
835 	test_inet_dtime(AF_INET6, SOCK_STREAM, IP6_DST, 50000 + t);
836 
837 	ASSERT_EQ(dtimes[INGRESS_FWDNS_P100], 0,
838 		  dtime_cnt_str(t, INGRESS_FWDNS_P100));
839 	ASSERT_EQ(dtimes[INGRESS_FWDNS_P101], 0,
840 		  dtime_cnt_str(t, INGRESS_FWDNS_P101));
841 	ASSERT_GT(dtimes[EGRESS_FWDNS_P100], 0,
842 		  dtime_cnt_str(t, EGRESS_FWDNS_P100));
843 	ASSERT_EQ(dtimes[EGRESS_FWDNS_P101], 0,
844 		  dtime_cnt_str(t, EGRESS_FWDNS_P101));
845 	ASSERT_GT(dtimes[EGRESS_ENDHOST], 0,
846 		  dtime_cnt_str(t, EGRESS_ENDHOST));
847 	ASSERT_GT(dtimes[INGRESS_ENDHOST], 0,
848 		  dtime_cnt_str(t, INGRESS_ENDHOST));
849 
850 	for (i = INGRESS_FWDNS_P100; i < __MAX_CNT; i++)
851 		ASSERT_EQ(errs[i], 0, dtime_err_str(t, i));
852 }
853 
test_tcp_dtime(struct test_tc_dtime * skel,int family,bool bpf_fwd)854 static void test_tcp_dtime(struct test_tc_dtime *skel, int family, bool bpf_fwd)
855 {
856 	__u32 *dtimes, *errs;
857 	const char *addr;
858 	int i, t;
859 
860 	if (family == AF_INET) {
861 		t = bpf_fwd ? TCP_IP4 : TCP_IP4_RT_FWD;
862 		addr = IP4_DST;
863 	} else {
864 		t = bpf_fwd ? TCP_IP6 : TCP_IP6_RT_FWD;
865 		addr = IP6_DST;
866 	}
867 
868 	dtimes = skel->bss->dtimes[t];
869 	errs = skel->bss->errs[t];
870 
871 	skel->bss->test = t;
872 	test_inet_dtime(family, SOCK_STREAM, addr, 50000 + t);
873 
874 	/* fwdns_prio100 prog does not read delivery_time_type, so
875 	 * kernel puts the (rcv) timestamp in __sk_buff->tstamp
876 	 */
877 	ASSERT_EQ(dtimes[INGRESS_FWDNS_P100], 0,
878 		  dtime_cnt_str(t, INGRESS_FWDNS_P100));
879 	for (i = INGRESS_FWDNS_P101; i < SET_DTIME; i++)
880 		ASSERT_GT(dtimes[i], 0, dtime_cnt_str(t, i));
881 
882 	for (i = INGRESS_FWDNS_P100; i < __MAX_CNT; i++)
883 		ASSERT_EQ(errs[i], 0, dtime_err_str(t, i));
884 }
885 
test_udp_dtime(struct test_tc_dtime * skel,int family,bool bpf_fwd)886 static void test_udp_dtime(struct test_tc_dtime *skel, int family, bool bpf_fwd)
887 {
888 	__u32 *dtimes, *errs;
889 	const char *addr;
890 	int i, t;
891 
892 	if (family == AF_INET) {
893 		t = bpf_fwd ? UDP_IP4 : UDP_IP4_RT_FWD;
894 		addr = IP4_DST;
895 	} else {
896 		t = bpf_fwd ? UDP_IP6 : UDP_IP6_RT_FWD;
897 		addr = IP6_DST;
898 	}
899 
900 	dtimes = skel->bss->dtimes[t];
901 	errs = skel->bss->errs[t];
902 
903 	skel->bss->test = t;
904 	test_inet_dtime(family, SOCK_DGRAM, addr, 50000 + t);
905 
906 	ASSERT_EQ(dtimes[INGRESS_FWDNS_P100], 0,
907 		  dtime_cnt_str(t, INGRESS_FWDNS_P100));
908 	for (i = EGRESS_FWDNS_P100; i < SET_DTIME; i++)
909 		ASSERT_GT(dtimes[i], 0, dtime_cnt_str(t, i));
910 
911 	for (i = INGRESS_FWDNS_P100; i < __MAX_CNT; i++)
912 		ASSERT_EQ(errs[i], 0, dtime_err_str(t, i));
913 }
914 
/*
 * Driver for the dtime tests: load and attach the test_tc_dtime programs in
 * all three namespaces, then run the TCP/UDP sub-tests twice - first with
 * forwarding disabled in the fwd namespace, then with it enabled.
 *
 * Any setup failure aborts the remaining sub-tests; the skeleton and the
 * timestamp-holding socket are released on every path past the first check.
 */
static void test_tc_redirect_dtime(struct netns_setup_result *setup_result)
{
	struct test_tc_dtime *skel;
	struct nstoken *nstoken;
	int hold_tstamp_fd, err;

	/* Hold a sk with the SOCK_TIMESTAMP set to ensure there
	 * is no delay in the kernel net_enable_timestamp().
	 * This ensures the following tests must have
	 * non zero rcv tstamp in the recvmsg().
	 */
	hold_tstamp_fd = wait_netstamp_needed_key();
	if (!ASSERT_GE(hold_tstamp_fd, 0, "wait_netstamp_needed_key"))
		return;

	skel = test_tc_dtime__open();
	if (!ASSERT_OK_PTR(skel, "test_tc_dtime__open"))
		goto done;

	/* The programs are told the ifindexes of the two fwd-side devices
	 * before being loaded.
	 */
	skel->rodata->IFINDEX_SRC = setup_result->ifindex_src_fwd;
	skel->rodata->IFINDEX_DST = setup_result->ifindex_dst_fwd;

	err = test_tc_dtime__load(skel);
	if (!ASSERT_OK(err, "test_tc_dtime__load"))
		goto done;

	if (netns_load_dtime_bpf(skel, setup_result))
		goto done;

	/* First round: forwarding disabled in the fwd namespace. */
	nstoken = open_netns(NS_FWD);
	if (!ASSERT_OK_PTR(nstoken, "setns fwd"))
		goto done;
	err = set_forwarding(false);
	close_netns(nstoken);
	if (!ASSERT_OK(err, "disable forwarding"))
		goto done;

	test_tcp_clear_dtime(skel);

	test_tcp_dtime(skel, AF_INET, true);
	test_tcp_dtime(skel, AF_INET6, true);
	test_udp_dtime(skel, AF_INET, true);
	test_udp_dtime(skel, AF_INET6, true);

	/* Test the kernel ip[6]_forward path instead
	 * of bpf_redirect_neigh().
	 */
	nstoken = open_netns(NS_FWD);
	if (!ASSERT_OK_PTR(nstoken, "setns fwd"))
		goto done;
	err = set_forwarding(true);
	close_netns(nstoken);
	if (!ASSERT_OK(err, "enable forwarding"))
		goto done;

	test_tcp_dtime(skel, AF_INET, false);
	test_tcp_dtime(skel, AF_INET6, false);
	test_udp_dtime(skel, AF_INET, false);
	test_udp_dtime(skel, AF_INET6, false);

done:
	/* skel has been assigned on every path that reaches this label. */
	test_tc_dtime__destroy(skel);
	close(hold_tstamp_fd);
}
979 
test_tc_redirect_neigh_fib(struct netns_setup_result * setup_result)980 static void test_tc_redirect_neigh_fib(struct netns_setup_result *setup_result)
981 {
982 	struct nstoken *nstoken = NULL;
983 	struct test_tc_neigh_fib *skel = NULL;
984 
985 	nstoken = open_netns(NS_FWD);
986 	if (!ASSERT_OK_PTR(nstoken, "setns fwd"))
987 		return;
988 
989 	skel = test_tc_neigh_fib__open();
990 	if (!ASSERT_OK_PTR(skel, "test_tc_neigh_fib__open"))
991 		goto done;
992 
993 	if (!ASSERT_OK(test_tc_neigh_fib__load(skel), "test_tc_neigh_fib__load"))
994 		goto done;
995 
996 	if (netns_load_bpf(skel->progs.tc_src, skel->progs.tc_dst,
997 			   skel->progs.tc_chk, setup_result))
998 		goto done;
999 
1000 	/* bpf_fib_lookup() checks if forwarding is enabled */
1001 	if (!ASSERT_OK(set_forwarding(true), "enable forwarding"))
1002 		goto done;
1003 
1004 	test_connectivity();
1005 
1006 done:
1007 	if (skel)
1008 		test_tc_neigh_fib__destroy(skel);
1009 	close_netns(nstoken);
1010 }
1011 
test_tc_redirect_neigh(struct netns_setup_result * setup_result)1012 static void test_tc_redirect_neigh(struct netns_setup_result *setup_result)
1013 {
1014 	struct nstoken *nstoken = NULL;
1015 	struct test_tc_neigh *skel = NULL;
1016 	int err;
1017 
1018 	nstoken = open_netns(NS_FWD);
1019 	if (!ASSERT_OK_PTR(nstoken, "setns fwd"))
1020 		return;
1021 
1022 	skel = test_tc_neigh__open();
1023 	if (!ASSERT_OK_PTR(skel, "test_tc_neigh__open"))
1024 		goto done;
1025 
1026 	skel->rodata->IFINDEX_SRC = setup_result->ifindex_src_fwd;
1027 	skel->rodata->IFINDEX_DST = setup_result->ifindex_dst_fwd;
1028 
1029 	err = test_tc_neigh__load(skel);
1030 	if (!ASSERT_OK(err, "test_tc_neigh__load"))
1031 		goto done;
1032 
1033 	if (netns_load_bpf(skel->progs.tc_src, skel->progs.tc_dst,
1034 			   skel->progs.tc_chk, setup_result))
1035 		goto done;
1036 
1037 	if (!ASSERT_OK(set_forwarding(false), "disable forwarding"))
1038 		goto done;
1039 
1040 	test_connectivity();
1041 
1042 done:
1043 	if (skel)
1044 		test_tc_neigh__destroy(skel);
1045 	close_netns(nstoken);
1046 }
1047 
test_tc_redirect_peer(struct netns_setup_result * setup_result)1048 static void test_tc_redirect_peer(struct netns_setup_result *setup_result)
1049 {
1050 	struct nstoken *nstoken;
1051 	struct test_tc_peer *skel;
1052 	int err;
1053 
1054 	nstoken = open_netns(NS_FWD);
1055 	if (!ASSERT_OK_PTR(nstoken, "setns fwd"))
1056 		return;
1057 
1058 	skel = test_tc_peer__open();
1059 	if (!ASSERT_OK_PTR(skel, "test_tc_peer__open"))
1060 		goto done;
1061 
1062 	skel->rodata->IFINDEX_SRC = setup_result->ifindex_src_fwd;
1063 	skel->rodata->IFINDEX_DST = setup_result->ifindex_dst_fwd;
1064 
1065 	err = test_tc_peer__load(skel);
1066 	if (!ASSERT_OK(err, "test_tc_peer__load"))
1067 		goto done;
1068 
1069 	if (netns_load_bpf(skel->progs.tc_src, skel->progs.tc_dst,
1070 			   skel->progs.tc_chk, setup_result))
1071 		goto done;
1072 
1073 	if (!ASSERT_OK(set_forwarding(false), "disable forwarding"))
1074 		goto done;
1075 
1076 	test_connectivity();
1077 
1078 done:
1079 	if (skel)
1080 		test_tc_peer__destroy(skel);
1081 	close_netns(nstoken);
1082 }
1083 
tun_open(char * name)1084 static int tun_open(char *name)
1085 {
1086 	struct ifreq ifr;
1087 	int fd, err;
1088 
1089 	fd = open("/dev/net/tun", O_RDWR);
1090 	if (!ASSERT_GE(fd, 0, "open /dev/net/tun"))
1091 		return -1;
1092 
1093 	memset(&ifr, 0, sizeof(ifr));
1094 
1095 	ifr.ifr_flags = IFF_TUN | IFF_NO_PI;
1096 	if (*name)
1097 		strncpy(ifr.ifr_name, name, IFNAMSIZ);
1098 
1099 	err = ioctl(fd, TUNSETIFF, &ifr);
1100 	if (!ASSERT_OK(err, "ioctl TUNSETIFF"))
1101 		goto fail;
1102 
1103 	SYS(fail, "ip link set dev %s up", name);
1104 
1105 	return fd;
1106 fail:
1107 	close(fd);
1108 	return -1;
1109 }
1110 
/* Relay direction picked by tun_relay_loop() for one read/write round. */
enum {
	SRC_TO_TARGET,	/* 0: read from src_fd, write to target_fd */
	TARGET_TO_SRC,	/* 1: read from target_fd, write to src_fd */
};
1115 
tun_relay_loop(int src_fd,int target_fd)1116 static int tun_relay_loop(int src_fd, int target_fd)
1117 {
1118 	fd_set rfds, wfds;
1119 
1120 	FD_ZERO(&rfds);
1121 	FD_ZERO(&wfds);
1122 
1123 	for (;;) {
1124 		char buf[1500];
1125 		int direction, nread, nwrite;
1126 
1127 		FD_SET(src_fd, &rfds);
1128 		FD_SET(target_fd, &rfds);
1129 
1130 		if (select(1 + MAX(src_fd, target_fd), &rfds, NULL, NULL, NULL) < 0) {
1131 			log_err("select failed");
1132 			return 1;
1133 		}
1134 
1135 		direction = FD_ISSET(src_fd, &rfds) ? SRC_TO_TARGET : TARGET_TO_SRC;
1136 
1137 		nread = read(direction == SRC_TO_TARGET ? src_fd : target_fd, buf, sizeof(buf));
1138 		if (nread < 0) {
1139 			log_err("read failed");
1140 			return 1;
1141 		}
1142 
1143 		nwrite = write(direction == SRC_TO_TARGET ? target_fd : src_fd, buf, nread);
1144 		if (nwrite != nread) {
1145 			log_err("write failed");
1146 			return 1;
1147 		}
1148 	}
1149 }
1150 
test_tc_redirect_peer_l3(struct netns_setup_result * setup_result)1151 static void test_tc_redirect_peer_l3(struct netns_setup_result *setup_result)
1152 {
1153 	LIBBPF_OPTS(bpf_tc_hook, qdisc_tun_fwd);
1154 	LIBBPF_OPTS(bpf_tc_hook, qdisc_dst_fwd);
1155 	struct test_tc_peer *skel = NULL;
1156 	struct nstoken *nstoken = NULL;
1157 	int err;
1158 	int tunnel_pid = -1;
1159 	int src_fd, target_fd = -1;
1160 	int ifindex;
1161 
1162 	/* Start a L3 TUN/TAP tunnel between the src and dst namespaces.
1163 	 * This test is using TUN/TAP instead of e.g. IPIP or GRE tunnel as those
1164 	 * expose the L2 headers encapsulating the IP packet to BPF and hence
1165 	 * don't have skb in suitable state for this test. Alternative to TUN/TAP
1166 	 * would be e.g. Wireguard which would appear as a pure L3 device to BPF,
1167 	 * but that requires much more complicated setup.
1168 	 */
1169 	nstoken = open_netns(NS_SRC);
1170 	if (!ASSERT_OK_PTR(nstoken, "setns " NS_SRC))
1171 		return;
1172 
1173 	src_fd = tun_open("tun_src");
1174 	if (!ASSERT_GE(src_fd, 0, "tun_open tun_src"))
1175 		goto fail;
1176 
1177 	close_netns(nstoken);
1178 
1179 	nstoken = open_netns(NS_FWD);
1180 	if (!ASSERT_OK_PTR(nstoken, "setns " NS_FWD))
1181 		goto fail;
1182 
1183 	target_fd = tun_open("tun_fwd");
1184 	if (!ASSERT_GE(target_fd, 0, "tun_open tun_fwd"))
1185 		goto fail;
1186 
1187 	tunnel_pid = fork();
1188 	if (!ASSERT_GE(tunnel_pid, 0, "fork tun_relay_loop"))
1189 		goto fail;
1190 
1191 	if (tunnel_pid == 0)
1192 		exit(tun_relay_loop(src_fd, target_fd));
1193 
1194 	skel = test_tc_peer__open();
1195 	if (!ASSERT_OK_PTR(skel, "test_tc_peer__open"))
1196 		goto fail;
1197 
1198 	ifindex = if_nametoindex("tun_fwd");
1199 	if (!ASSERT_GT(ifindex, 0, "if_indextoname tun_fwd"))
1200 		goto fail;
1201 
1202 	skel->rodata->IFINDEX_SRC = ifindex;
1203 	skel->rodata->IFINDEX_DST = setup_result->ifindex_dst_fwd;
1204 
1205 	err = test_tc_peer__load(skel);
1206 	if (!ASSERT_OK(err, "test_tc_peer__load"))
1207 		goto fail;
1208 
1209 	/* Load "tc_src_l3" to the tun_fwd interface to redirect packets
1210 	 * towards dst, and "tc_dst" to redirect packets
1211 	 * and "tc_chk" on dst_fwd to drop non-redirected packets.
1212 	 */
1213 	/* tc qdisc add dev tun_fwd clsact */
1214 	QDISC_CLSACT_CREATE(&qdisc_tun_fwd, ifindex);
1215 	/* tc filter add dev tun_fwd ingress bpf da tc_src_l3 */
1216 	XGRESS_FILTER_ADD(&qdisc_tun_fwd, BPF_TC_INGRESS, skel->progs.tc_src_l3, 0);
1217 
1218 	/* tc qdisc add dev dst_fwd clsact */
1219 	QDISC_CLSACT_CREATE(&qdisc_dst_fwd, setup_result->ifindex_dst_fwd);
1220 	/* tc filter add dev dst_fwd ingress bpf da tc_dst_l3 */
1221 	XGRESS_FILTER_ADD(&qdisc_dst_fwd, BPF_TC_INGRESS, skel->progs.tc_dst_l3, 0);
1222 	/* tc filter add dev dst_fwd egress bpf da tc_chk */
1223 	XGRESS_FILTER_ADD(&qdisc_dst_fwd, BPF_TC_EGRESS, skel->progs.tc_chk, 0);
1224 
1225 	/* Setup route and neigh tables */
1226 	SYS(fail, "ip -netns " NS_SRC " addr add dev tun_src " IP4_TUN_SRC "/24");
1227 	SYS(fail, "ip -netns " NS_FWD " addr add dev tun_fwd " IP4_TUN_FWD "/24");
1228 
1229 	SYS(fail, "ip -netns " NS_SRC " addr add dev tun_src " IP6_TUN_SRC "/64 nodad");
1230 	SYS(fail, "ip -netns " NS_FWD " addr add dev tun_fwd " IP6_TUN_FWD "/64 nodad");
1231 
1232 	SYS(fail, "ip -netns " NS_SRC " route del " IP4_DST "/32 dev src scope global");
1233 	SYS(fail, "ip -netns " NS_SRC " route add " IP4_DST "/32 via " IP4_TUN_FWD
1234 	    " dev tun_src scope global");
1235 	SYS(fail, "ip -netns " NS_DST " route add " IP4_TUN_SRC "/32 dev dst scope global");
1236 	SYS(fail, "ip -netns " NS_SRC " route del " IP6_DST "/128 dev src scope global");
1237 	SYS(fail, "ip -netns " NS_SRC " route add " IP6_DST "/128 via " IP6_TUN_FWD
1238 	    " dev tun_src scope global");
1239 	SYS(fail, "ip -netns " NS_DST " route add " IP6_TUN_SRC "/128 dev dst scope global");
1240 
1241 	SYS(fail, "ip -netns " NS_DST " neigh add " IP4_TUN_SRC " dev dst lladdr " MAC_DST_FWD);
1242 	SYS(fail, "ip -netns " NS_DST " neigh add " IP6_TUN_SRC " dev dst lladdr " MAC_DST_FWD);
1243 
1244 	if (!ASSERT_OK(set_forwarding(false), "disable forwarding"))
1245 		goto fail;
1246 
1247 	test_connectivity();
1248 
1249 fail:
1250 	if (tunnel_pid > 0) {
1251 		kill(tunnel_pid, SIGTERM);
1252 		waitpid(tunnel_pid, NULL, 0);
1253 	}
1254 	if (src_fd >= 0)
1255 		close(src_fd);
1256 	if (target_fd >= 0)
1257 		close(target_fd);
1258 	if (skel)
1259 		test_tc_peer__destroy(skel);
1260 	if (nstoken)
1261 		close_netns(nstoken);
1262 }
1263 
/*
 * Run one subtest inside freshly created namespaces.
 *
 * @name: test function suffix; test_<name>() is invoked and the
 *        stringified name is passed to test__start_subtest().
 * @mode: stored in the .dev_mode field of the netns_setup_result that is
 *        handed to netns_setup_links_and_routes() and to the test.
 *
 * Nothing happens unless test__start_subtest() accepts the subtest. Once
 * adding the namespaces succeeded they are deleted again, whether or not
 * the link/route setup (and hence the test itself) went through.
 */
#define RUN_TEST(name, mode) \
	({ \
		struct netns_setup_result ns_result = { .dev_mode = mode, }; \
		if (test__start_subtest(#name) && \
		    ASSERT_OK(netns_setup_namespaces("add"), "setup namespaces")) { \
			if (ASSERT_OK(netns_setup_links_and_routes(&ns_result), \
				      "setup links and routes")) \
				test_ ## name(&ns_result); \
			netns_setup_namespaces("delete"); \
		} \
	})
1275 
/*
 * Thread entry point: runs every tc_redirect subtest, one after another.
 *
 * @arg: unused.
 *
 * Always returns NULL; results are reported through the ASSERT_*() macros
 * used by the individual subtests.
 */
static void *test_tc_redirect_run_tests(void *arg)
{
	/* NOTE(review): presumably removes namespaces left behind by an
	 * earlier run without treating their absence as an error - confirm
	 * against netns_setup_namespaces_nofail().
	 */
	netns_setup_namespaces_nofail("delete");

	/* Each RUN_TEST() is skipped unless its subtest is selected. The
	 * peer tests run once per device mode, the others with MODE_VETH
	 * only.
	 */
	RUN_TEST(tc_redirect_peer, MODE_VETH);
	RUN_TEST(tc_redirect_peer, MODE_NETKIT);
	RUN_TEST(tc_redirect_peer_l3, MODE_VETH);
	RUN_TEST(tc_redirect_peer_l3, MODE_NETKIT);
	RUN_TEST(tc_redirect_neigh, MODE_VETH);
	RUN_TEST(tc_redirect_neigh_fib, MODE_VETH);
	RUN_TEST(tc_redirect_dtime, MODE_VETH);
	return NULL;
}
1289 
test_tc_redirect(void)1290 void test_tc_redirect(void)
1291 {
1292 	pthread_t test_thread;
1293 	int err;
1294 
1295 	/* Run the tests in their own thread to isolate the namespace changes
1296 	 * so they do not affect the environment of other tests.
1297 	 * (specifically needed because of unshare(CLONE_NEWNS) in open_netns())
1298 	 */
1299 	err = pthread_create(&test_thread, NULL, &test_tc_redirect_run_tests, NULL);
1300 	if (ASSERT_OK(err, "pthread_create"))
1301 		ASSERT_OK(pthread_join(test_thread, NULL), "pthread_join");
1302 }
1303