// SPDX-License-Identifier: GPL-2.0-only
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/netlink.h>
#include <linux/netfilter.h>
#include <linux/workqueue.h>
#include <linux/spinlock.h>
#include <linux/netfilter/nf_conntrack_common.h>
#include <linux/netfilter/nf_tables.h>
#include <net/ip.h> /* for ipv4 options. */
#include <net/inet_dscp.h>
#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables_core.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_extend.h>
#include <net/netfilter/nf_flow_table.h>

struct nft_flow_offload {
	struct nft_flowtable	*flowtable;
};
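
/*
 * Illustrative usage sketch (not part of this file): the "flow_offload"
 * expression is driven from an nft ruleset that pushes established
 * connections into a flowtable. Table, flowtable and device names below
 * are assumptions for the example.
 *
 *	table inet filter {
 *		flowtable ft {
 *			hook ingress priority 0
 *			devices = { eth0, eth1 }
 *		}
 *		chain forward {
 *			type filter hook forward priority 0; policy accept;
 *			meta l4proto { tcp, udp } flow add @ft
 *		}
 *	}
 */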

static enum flow_offload_xmit_type nft_xmit_type(struct dst_entry *dst)
{
	if (dst_xfrm(dst))
		return FLOW_OFFLOAD_XMIT_XFRM;

	return FLOW_OFFLOAD_XMIT_NEIGH;
}

/*
 * The dst for @dir describes where packets in that direction are sent;
 * its device is therefore the ingress interface of the reply direction.
 */
static void nft_default_forward_path(struct nf_flow_route *route,
				     struct dst_entry *dst_cache,
				     enum ip_conntrack_dir dir)
{
	route->tuple[!dir].in.ifindex	= dst_cache->dev->ifindex;
	route->tuple[dir].dst		= dst_cache;
	route->tuple[dir].xmit_type	= nft_xmit_type(dst_cache);
}

static bool nft_is_valid_ether_device(const struct net_device *dev)
{
	if (!dev || (dev->flags & IFF_LOOPBACK) || dev->type != ARPHRD_ETHER ||
	    dev->addr_len != ETH_ALEN || !is_valid_ether_addr(dev->dev_addr))
		return false;

	return true;
}

static int nft_dev_fill_forward_path(const struct nf_flow_route *route,
				     const struct dst_entry *dst_cache,
				     const struct nf_conn *ct,
				     enum ip_conntrack_dir dir, u8 *ha,
				     struct net_device_path_stack *stack)
{
	const void *daddr = &ct->tuplehash[!dir].tuple.src.u3;
	struct net_device *dev = dst_cache->dev;
	struct neighbour *n;
	u8 nud_state;

	if (!nft_is_valid_ether_device(dev))
		goto out;

	n = dst_neigh_lookup(dst_cache, daddr);
	if (!n)
		return -1;

	read_lock_bh(&n->lock);
	nud_state = n->nud_state;
	ether_addr_copy(ha, n->ha);
	read_unlock_bh(&n->lock);
	neigh_release(n);

	if (!(nud_state & NUD_VALID))
		return -1;

out:
	return dev_fill_forward_path(dev, ha, stack);
}

struct nft_forward_info {
	const struct net_device *indev;
	const struct net_device *outdev;
	const struct net_device *hw_outdev;
	struct id {
		__u16	id;
		__be16	proto;
	} encap[NF_FLOW_TABLE_ENCAP_MAX];
	u8 num_encaps;
	u8 ingress_vlans;
	u8 h_source[ETH_ALEN];
	u8 h_dest[ETH_ALEN];
	enum flow_offload_xmit_type xmit_type;
};

static void nft_dev_path_info(const struct net_device_path_stack *stack,
			      struct nft_forward_info *info,
			      unsigned char *ha, struct nf_flowtable *flowtable)
{
	const struct net_device_path *path;
	int i;

	memcpy(info->h_dest, ha, ETH_ALEN);

	for (i = 0; i < stack->num_paths; i++) {
		path = &stack->path[i];
		switch (path->type) {
		case DEV_PATH_ETHERNET:
		case DEV_PATH_DSA:
		case DEV_PATH_VLAN:
		case DEV_PATH_PPPOE:
			info->indev = path->dev;
			if (is_zero_ether_addr(info->h_source))
				memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN);

			if (path->type == DEV_PATH_ETHERNET)
				break;
			/* DSA ports terminate the path walk. */
			if (path->type == DEV_PATH_DSA) {
				i = stack->num_paths;
				break;
			}

			/* DEV_PATH_VLAN and DEV_PATH_PPPOE */
			if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) {
				info->indev = NULL;
				break;
			}
			if (!info->outdev)
				info->outdev = path->dev;
			info->encap[info->num_encaps].id = path->encap.id;
			info->encap[info->num_encaps].proto = path->encap.proto;
			info->num_encaps++;
			if (path->type == DEV_PATH_PPPOE)
				memcpy(info->h_dest, path->encap.h_dest, ETH_ALEN);
			break;
		case DEV_PATH_BRIDGE:
			if (is_zero_ether_addr(info->h_source))
				memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN);

			switch (path->bridge.vlan_mode) {
			case DEV_PATH_BR_VLAN_UNTAG_HW:
				info->ingress_vlans |= BIT(info->num_encaps - 1);
				break;
			case DEV_PATH_BR_VLAN_TAG:
				info->encap[info->num_encaps].id = path->bridge.vlan_id;
				info->encap[info->num_encaps].proto = path->bridge.vlan_proto;
				info->num_encaps++;
				break;
			case DEV_PATH_BR_VLAN_UNTAG:
				info->num_encaps--;
				break;
			case DEV_PATH_BR_VLAN_KEEP:
				break;
			}
			info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT;
			break;
		default:
			info->indev = NULL;
			break;
		}
	}
	if (!info->outdev)
		info->outdev = info->indev;

	info->hw_outdev = info->indev;

	if (nf_flowtable_hw_offload(flowtable) &&
	    nft_is_valid_ether_device(info->indev))
		info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT;
}
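
/*
 * A flow is only offloaded if the ingress device resolved from the forward
 * path is one of the devices the flowtable is attached to; otherwise
 * packets would never hit the flowtable hook on their way in.
 */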
static bool nft_flowtable_find_dev(const struct net_device *dev,
				   struct nft_flowtable *ft)
{
	struct nft_hook *hook;
	bool found = false;

	list_for_each_entry_rcu(hook, &ft->hook_list, list) {
		if (hook->ops.dev != dev)
			continue;

		found = true;
		break;
	}

	return found;
}

static void nft_dev_forward_path(struct nf_flow_route *route,
				 const struct nf_conn *ct,
				 enum ip_conntrack_dir dir,
				 struct nft_flowtable *ft)
{
	const struct dst_entry *dst = route->tuple[dir].dst;
	struct net_device_path_stack stack;
	struct nft_forward_info info = {};
	unsigned char ha[ETH_ALEN];
	int i;

	if (nft_dev_fill_forward_path(route, dst, ct, dir, ha, &stack) >= 0)
		nft_dev_path_info(&stack, &info, ha, &ft->data);

	if (!info.indev || !nft_flowtable_find_dev(info.indev, ft))
		return;

	route->tuple[!dir].in.ifindex = info.indev->ifindex;
	for (i = 0; i < info.num_encaps; i++) {
		route->tuple[!dir].in.encap[i].id = info.encap[i].id;
		route->tuple[!dir].in.encap[i].proto = info.encap[i].proto;
	}
	route->tuple[!dir].in.num_encaps = info.num_encaps;
	route->tuple[!dir].in.ingress_vlans = info.ingress_vlans;

	if (info.xmit_type == FLOW_OFFLOAD_XMIT_DIRECT) {
		memcpy(route->tuple[dir].out.h_source, info.h_source, ETH_ALEN);
		memcpy(route->tuple[dir].out.h_dest, info.h_dest, ETH_ALEN);
		route->tuple[dir].out.ifindex = info.outdev->ifindex;
		route->tuple[dir].out.hw_ifindex = info.hw_outdev->ifindex;
		route->tuple[dir].xmit_type = info.xmit_type;
	}
}

static int nft_flow_route(const struct nft_pktinfo *pkt,
			  const struct nf_conn *ct,
			  struct nf_flow_route *route,
			  enum ip_conntrack_dir dir,
			  struct nft_flowtable *ft)
{
	struct dst_entry *this_dst = skb_dst(pkt->skb);
	struct dst_entry *other_dst = NULL;
	struct flowi fl;

	memset(&fl, 0, sizeof(fl));
	switch (nft_pf(pkt)) {
	case NFPROTO_IPV4:
		fl.u.ip4.daddr = ct->tuplehash[dir].tuple.src.u3.ip;
		fl.u.ip4.saddr = ct->tuplehash[!dir].tuple.src.u3.ip;
		fl.u.ip4.flowi4_oif = nft_in(pkt)->ifindex;
		fl.u.ip4.flowi4_iif = this_dst->dev->ifindex;
		fl.u.ip4.flowi4_tos = ip_hdr(pkt->skb)->tos & INET_DSCP_MASK;
		fl.u.ip4.flowi4_mark = pkt->skb->mark;
		fl.u.ip4.flowi4_flags = FLOWI_FLAG_ANYSRC;
		break;
	case NFPROTO_IPV6:
		fl.u.ip6.daddr = ct->tuplehash[dir].tuple.src.u3.in6;
		fl.u.ip6.saddr = ct->tuplehash[!dir].tuple.src.u3.in6;
		fl.u.ip6.flowi6_oif = nft_in(pkt)->ifindex;
		fl.u.ip6.flowi6_iif = this_dst->dev->ifindex;
		fl.u.ip6.flowlabel = ip6_flowinfo(ipv6_hdr(pkt->skb));
		fl.u.ip6.flowi6_mark = pkt->skb->mark;
		fl.u.ip6.flowi6_flags = FLOWI_FLAG_ANYSRC;
		break;
	}

	/* Grab a reference on the ingress dst; it may be on its way to
	 * getting released concurrently.
	 */
	if (!dst_hold_safe(this_dst))
		return -ENOENT;

	nf_route(nft_net(pkt), &other_dst, &fl, false, nft_pf(pkt));
	if (!other_dst) {
		dst_release(this_dst);
		return -ENOENT;
	}

	nft_default_forward_path(route, this_dst, dir);
	nft_default_forward_path(route, other_dst, !dir);

	/* Only resolve the layer 2 forward path if neither direction is
	 * routed through xfrm.
	 */
	if (route->tuple[dir].xmit_type	== FLOW_OFFLOAD_XMIT_NEIGH &&
	    route->tuple[!dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH) {
		nft_dev_forward_path(route, ct, dir, ft);
		nft_dev_forward_path(route, ct, !dir, ft);
	}

	return 0;
}
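
/*
 * Packets with an IPsec secpath or IPv4 options keep taking the slow path;
 * the flowtable fast path handles neither.
 */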
static bool nft_flow_offload_skip(struct sk_buff *skb, int family)
{
	if (skb_sec_path(skb))
		return true;

	if (family == NFPROTO_IPV4) {
		const struct ip_options *opt;

		opt = &(IPCB(skb)->opt);

		if (unlikely(opt->optlen))
			return true;
	}

	return false;
}

static void nft_flow_offload_eval(const struct nft_expr *expr,
				  struct nft_regs *regs,
				  const struct nft_pktinfo *pkt)
{
	struct nft_flow_offload *priv = nft_expr_priv(expr);
	struct nf_flowtable *flowtable = &priv->flowtable->data;
	struct tcphdr _tcph, *tcph = NULL;
	struct nf_flow_route route = {};
	enum ip_conntrack_info ctinfo;
	struct flow_offload *flow;
	enum ip_conntrack_dir dir;
	struct nf_conn *ct;
	int ret;

	if (nft_flow_offload_skip(pkt->skb, nft_pf(pkt)))
		goto out;

	ct = nf_ct_get(pkt->skb, &ctinfo);
	if (!ct)
		goto out;

	switch (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum) {
	case IPPROTO_TCP:
		tcph = skb_header_pointer(pkt->skb, nft_thoff(pkt),
					  sizeof(_tcph), &_tcph);
		if (unlikely(!tcph || tcph->fin || tcph->rst ||
			     !nf_conntrack_tcp_established(ct)))
			goto out;
		break;
	case IPPROTO_UDP:
		break;
#ifdef CONFIG_NF_CT_PROTO_GRE
	case IPPROTO_GRE: {
		struct nf_conntrack_tuple *tuple;

		if (ct->status & IPS_NAT_MASK)
			goto out;
		tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
		/* No support for GRE v1 */
		if (tuple->src.u.gre.key || tuple->dst.u.gre.key)
			goto out;
		break;
	}
#endif
	default:
		goto out;
	}

	if (nf_ct_ext_exist(ct, NF_CT_EXT_HELPER) ||
	    ct->status & (IPS_SEQ_ADJUST | IPS_NAT_CLASH))
		goto out;

	if (!nf_ct_is_confirmed(ct))
		goto out;

	/* Claim this conntrack entry for offload; if the bit is already
	 * set, another flow entry owns it.
	 */
	if (test_and_set_bit(IPS_OFFLOAD_BIT, &ct->status))
		goto out;

	dir = CTINFO2DIR(ctinfo);
	if (nft_flow_route(pkt, ct, &route, dir, priv->flowtable) < 0)
		goto err_flow_route;

	flow = flow_offload_alloc(ct);
	if (!flow)
		goto err_flow_alloc;

	flow_offload_route_init(flow, &route);

	if (tcph) {
		/* Window tracking cannot follow packets once they take
		 * the fast path.
		 */
		ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
		ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
	}

	__set_bit(NF_FLOW_HW_BIDIRECTIONAL, &flow->flags);
	ret = flow_offload_add(flowtable, flow);
	if (ret < 0)
		goto err_flow_add;

	return;

err_flow_add:
	flow_offload_free(flow);
err_flow_alloc:
	dst_release(route.tuple[dir].dst);
	dst_release(route.tuple[!dir].dst);
err_flow_route:
	clear_bit(IPS_OFFLOAD_BIT, &ct->status);
out:
	regs->verdict.code = NFT_BREAK;
}

static int nft_flow_offload_validate(const struct nft_ctx *ctx,
				     const struct nft_expr *expr)
{
	unsigned int hook_mask = (1 << NF_INET_FORWARD);

	if (ctx->family != NFPROTO_IPV4 &&
	    ctx->family != NFPROTO_IPV6 &&
	    ctx->family != NFPROTO_INET)
		return -EOPNOTSUPP;

	return nft_chain_validate_hooks(ctx->chain, hook_mask);
}
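
/* The only netlink attribute is the name of the flowtable to add flows to. */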
static const struct nla_policy nft_flow_offload_policy[NFTA_FLOW_MAX + 1] = {
	[NFTA_FLOW_TABLE_NAME]	= { .type = NLA_STRING,
				    .len = NFT_NAME_MAXLEN - 1 },
};

static int nft_flow_offload_init(const struct nft_ctx *ctx,
				 const struct nft_expr *expr,
				 const struct nlattr * const tb[])
{
	struct nft_flow_offload *priv = nft_expr_priv(expr);
	u8 genmask = nft_genmask_next(ctx->net);
	struct nft_flowtable *flowtable;

	if (!tb[NFTA_FLOW_TABLE_NAME])
		return -EINVAL;

	flowtable = nft_flowtable_lookup(ctx->table, tb[NFTA_FLOW_TABLE_NAME],
					 genmask);
	if (IS_ERR(flowtable))
		return PTR_ERR(flowtable);

	if (!nft_use_inc(&flowtable->use))
		return -EMFILE;

	priv->flowtable = flowtable;

	return nf_ct_netns_get(ctx->net, ctx->family);
}

static void nft_flow_offload_deactivate(const struct nft_ctx *ctx,
					const struct nft_expr *expr,
					enum nft_trans_phase phase)
{
	struct nft_flow_offload *priv = nft_expr_priv(expr);

	nf_tables_deactivate_flowtable(ctx, priv->flowtable, phase);
}

static void nft_flow_offload_activate(const struct nft_ctx *ctx,
				      const struct nft_expr *expr)
{
	struct nft_flow_offload *priv = nft_expr_priv(expr);

	nft_use_inc_restore(&priv->flowtable->use);
}

static void nft_flow_offload_destroy(const struct nft_ctx *ctx,
				     const struct nft_expr *expr)
{
	nf_ct_netns_put(ctx->net, ctx->family);
}

static int nft_flow_offload_dump(struct sk_buff *skb,
				 const struct nft_expr *expr, bool reset)
{
	struct nft_flow_offload *priv = nft_expr_priv(expr);

	if (nla_put_string(skb, NFTA_FLOW_TABLE_NAME, priv->flowtable->name))
		goto nla_put_failure;

	return 0;

nla_put_failure:
	return -1;
}

static struct nft_expr_type nft_flow_offload_type;
static const struct nft_expr_ops nft_flow_offload_ops = {
	.type		= &nft_flow_offload_type,
	.size		= NFT_EXPR_SIZE(sizeof(struct nft_flow_offload)),
	.eval		= nft_flow_offload_eval,
	.init		= nft_flow_offload_init,
	.activate	= nft_flow_offload_activate,
	.deactivate	= nft_flow_offload_deactivate,
	.destroy	= nft_flow_offload_destroy,
	.validate	= nft_flow_offload_validate,
	.dump		= nft_flow_offload_dump,
	.reduce		= NFT_REDUCE_READONLY,
};

static struct nft_expr_type nft_flow_offload_type __read_mostly = {
	.name		= "flow_offload",
	.ops		= &nft_flow_offload_ops,
	.policy		= nft_flow_offload_policy,
	.maxattr	= NFTA_FLOW_MAX,
	.owner		= THIS_MODULE,
};

static int flow_offload_netdev_event(struct notifier_block *this,
				     unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);

	if (event != NETDEV_DOWN)
		return NOTIFY_DONE;

	/* Flush flowtable entries that use the device going down. */
	nf_flow_table_cleanup(dev);

	return NOTIFY_DONE;
}

static struct notifier_block flow_offload_netdev_notifier = {
	.notifier_call	= flow_offload_netdev_event,
};

static int __init nft_flow_offload_module_init(void)
{
	int err;

	err = register_netdevice_notifier(&flow_offload_netdev_notifier);
	if (err)
		goto err;

	err = nft_register_expr(&nft_flow_offload_type);
	if (err < 0)
		goto register_expr;

	return 0;

register_expr:
	unregister_netdevice_notifier(&flow_offload_netdev_notifier);
err:
	return err;
}

static void __exit nft_flow_offload_module_exit(void)
{
	nft_unregister_expr(&nft_flow_offload_type);
	unregister_netdevice_notifier(&flow_offload_netdev_notifier);
}

module_init(nft_flow_offload_module_init);
module_exit(nft_flow_offload_module_exit);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
MODULE_ALIAS_NFT_EXPR("flow_offload");
MODULE_DESCRIPTION("nftables hardware flow offload module");