1 Linux Network Architecture Network Layer Isaac Y. Tsai

2 2010/09/17 © by Outline Network layer in Linux Netfilter and iptables framework PF_RING architecture

3 2010/09/17 © by Interface between device driver and network layer

4 2010/09/17 © by Network layer functions /net/ipv4/ip_input.c ip_rcv(skb) ip_rcv_finish(skb) ip_local_deliver(skb) ip_local_deliver_finish(skb) /net/ipv4/ip_forward.c ip_forward(skb) ip_forward_finish(skb) /net/ipv4/ipmr.c int ip_mr_input(skb) /net/ipv4/ip_output.c ip_queue_xmit(skb,ipfragok) ip_local_out(skb) __ip_local_out(skb) ip_output(skb) ip_finish_output(skb) ip_finish_output2(skb) ip_mc_output(skb)
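
The following slides walk through these functions in order. As an orientation, a simplified call flow of the four paths (netfilter hooks in brackets; they are covered in detail in the second part of the deck):
Reception: netif_receive_skb() -> ip_rcv() -> [NF_INET_PRE_ROUTING] -> ip_rcv_finish() -> dst_input()
Local delivery: dst_input() -> ip_local_deliver() -> [NF_INET_LOCAL_IN] -> ip_local_deliver_finish() -> transport protocol handler
Forwarding: dst_input() -> ip_forward() -> [NF_INET_FORWARD] -> ip_forward_finish() -> dst_output()
Transmission: ip_queue_xmit() -> ip_local_out() -> [NF_INET_LOCAL_OUT] -> ip_output() -> [NF_INET_POST_ROUTING] -> ip_finish_output() -> ip_finish_output2()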

5 2010/09/17 © by netif_receive_skb() /net/core/dev.c int netif_receive_skb(struct sk_buff *skb) { struct packet_type *ptype, *pt_prev; struct net_device *orig_dev, *master, *null_or_orig, *null_or_bond; int ret = NET_RX_DROP; __be16 type; if (!skb->tstamp.tv64) net_timestamp(skb); if (vlan_tx_tag_present(skb) && vlan_hwaccel_do_receive(skb)) return NET_RX_SUCCESS; if (netpoll_receive_skb(skb)) return NET_RX_DROP; if (!skb->skb_iif) skb->skb_iif = skb->dev->ifindex; null_or_orig = NULL; orig_dev = skb->dev; master = ACCESS_ONCE(orig_dev->master);

6 2010/09/17 © by netif_receive_skb() (cont’ed) if (master) { if (skb_bond_should_drop(skb, master)) null_or_orig = orig_dev; else skb->dev = master; } __get_cpu_var(netdev_rx_stat).total++; skb_reset_network_header(skb); skb_reset_transport_header(skb); skb->mac_len = skb->network_header - skb->mac_header; pt_prev = NULL; rcu_read_lock(); #ifdef CONFIG_NET_CLS_ACT if (skb->tc_verd & TC_NCLS) { skb->tc_verd = CLR_TC_NCLS(skb->tc_verd); goto ncls; } #endif list_for_each_entry_rcu(ptype, &ptype_all, list) { if (ptype->dev == null_or_orig || ptype->dev == skb->dev || ptype->dev == orig_dev) { if (pt_prev) ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = ptype; } }

7 2010/09/17 © by netif_receive_skb() (cont’ed) #ifdef CONFIG_NET_CLS_ACT skb = handle_ing(skb, &pt_prev, &ret, orig_dev); if (!skb) goto out; ncls: #endif skb = handle_bridge(skb, &pt_prev, &ret, orig_dev); if (!skb) goto out; skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev); if (!skb) goto out; null_or_bond = NULL; if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) && (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) { null_or_bond = vlan_dev_real_dev(skb->dev); } type = skb->protocol;

8 2010/09/17 © by netif_receive_skb() (cont’ed) list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) { if (ptype->type == type && (ptype->dev == null_or_orig || ptype->dev == skb->dev || ptype->dev == orig_dev || ptype->dev == null_or_bond)) { if (pt_prev) ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = ptype; } } if (pt_prev) { ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); } else { kfree_skb(skb); ret = NET_RX_DROP; } out: rcu_read_unlock(); return ret; }

9 2010/09/17 © by net_rx_action() /net/core/dev.c static void net_rx_action(struct softirq_action *h) { struct list_head *list = &__get_cpu_var(softnet_data).poll_list; unsigned long time_limit = jiffies + 2; int budget = netdev_budget; void *have; local_irq_disable(); while (!list_empty(list)) { struct napi_struct *n; int work, weight; if (unlikely(budget <= 0 || time_after(jiffies, time_limit))) goto softnet_break; local_irq_enable(); n = list_first_entry(list, struct napi_struct, poll_list);

10 2010/09/17 © by net_rx_action() (cont’ed) have = netpoll_poll_lock(n); weight = n->weight; work = 0; if (test_bit(NAPI_STATE_SCHED, &n->state)) { work = n->poll(n, weight); trace_napi_poll(n); } WARN_ON_ONCE(work > weight); budget -= work; local_irq_disable(); if (unlikely(work == weight)) { if (unlikely(napi_disable_pending(n))) { local_irq_enable(); napi_complete(n); local_irq_disable(); } else list_move_tail(&n->poll_list, list); } netpoll_poll_unlock(have); }

11 2010/09/17 © by net_rx_action() (cont’ed) out: local_irq_enable(); #ifdef CONFIG_NET_DMA /* * There may not be any more sk_buffs coming right now, so push * any pending DMA copies to hardware */ dma_issue_pending_all(); #endif return; softnet_break: __get_cpu_var(netdev_rx_stat).time_squeeze++; __raise_softirq_irqoff(NET_RX_SOFTIRQ); goto out; }

12 2010/09/17 © by Packet reception path: ip_rcv() Network layer packet reception starts in ip_rcv(). ip_rcv() first performs some error checking on the packet type and the IP header and updates packet statistics. At the end, it invokes the netfilter macro NF_HOOK(PF_INET, NF_INET_PRE_ROUTING, skb, dev, NULL, ip_rcv_finish);

13 2010/09/17 © by ip_rcv() /net/ipv4/ip_input.c int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct iphdr *iph; u32 len; if (skb->pkt_type == PACKET_OTHERHOST)goto drop; IP_UPD_PO_STATS_BH(dev_net(dev), IPSTATS_MIB_IN, skb->len); if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) { IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS); goto out; } if (!pskb_may_pull(skb, sizeof(struct iphdr)))goto inhdr_error;

14 2010/09/17 © by ip_rcv() (cont’ed) iph = ip_hdr(skb); if (iph->ihl < 5 || iph->version != 4) goto inhdr_error; if (!pskb_may_pull(skb, iph->ihl*4)) goto inhdr_error; iph = ip_hdr(skb); if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) goto inhdr_error; len = ntohs(iph->tot_len); if (skb->len < len) { IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INTRUNCATEDPKTS); goto drop; } else if (len < (iph->ihl*4)) goto inhdr_error;

15 2010/09/17 © by ip_rcv() (cont’ed) if (pskb_trim_rcsum(skb, len)) { IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS); goto drop; } /* Remove any debris in the socket control block */ memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); /* Must drop socket now because of tproxy. */ skb_orphan(skb); return NF_HOOK(PF_INET, NF_INET_PRE_ROUTING, skb, dev, NULL, ip_rcv_finish); inhdr_error: IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS); drop: kfree_skb(skb); out: return NET_RX_DROP; }

16 2010/09/17 © by ip_rcv_finish() ip_rcv_finish() calls ip_route_input() The skb->dst pointer of the socket buffer is set to an entry in the routing cache, which stores not only the destination on the IP level, but also a pointer to an entry in the hard header cache (cache for layer-2 frame packet headers), if present. If ip_route_input() cannot find a route, then the packet is discarded. Finally in ip_rcv_finish(), the procedure of the IP protocol reaches the junction between packets addressed to the local computer and packets to be forwarded. The information about the further path of an IP packet is stored in the routing entry skb->dst. Notice that a trick often used in the Linux kernel is used here. If a switch (variable value) is used to select different functions, then we simply insert a pointer to each of these functions. This saves us an if or switch instruction for each decision of how the program should continue. In the example used here, the pointer skb->dst->input() points to the function that should be used to handle a packet further:

17 2010/09/17 © by The pointer skb->dst->input() points to the function that should be used to handle a packet further: ip_local_deliver() is entered in the case of unicast and multicast packets that should be delivered to the local computer. ip_forward() handles all unicast packets that should be forwarded. ip_mr_input() is used for multicast packets that should be forwarded.
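
The pattern is worth a second look because it recurs throughout the kernel. A minimal sketch of the dispatch (handle_routed_packet is an illustrative name; the real kernel helper is dst_input(), shown a few slides below, and the assignments happen during route resolution in ip_route_input_slow()):
/* Sketch: per-route handler dispatch via the routing cache entry.
 * The routing lookup stores the handler pointer once, so the
 * per-packet hot path needs no if/switch. */
static int handle_routed_packet(struct sk_buff *skb)
{
    struct dst_entry *dst = skb_dst(skb);
    /* dst->input was set while the route was resolved, to one of:
     *   ip_local_deliver - unicast/multicast for this host
     *   ip_forward       - unicast to be forwarded
     *   ip_mr_input      - multicast to be forwarded */
    return dst->input(skb);
}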

18 2010/09/17 © by ip_rcv_finish(skb) /net/ipv4/ip_input.c static int ip_rcv_finish(struct sk_buff *skb) { const struct iphdr *iph = ip_hdr(skb); struct rtable *rt; if (skb_dst(skb) == NULL) { int err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, skb->dev); if (unlikely(err)) { if (err == -EHOSTUNREACH) IP_INC_STATS_BH(dev_net(skb->dev), IPSTATS_MIB_INADDRERRORS); else if (err == -ENETUNREACH) IP_INC_STATS_BH(dev_net(skb->dev), IPSTATS_MIB_INNOROUTES); goto drop; } }

19 2010/09/17 © by ip_rcv_finish(skb) (cont’ed) #ifdef CONFIG_NET_CLS_ROUTE if (unlikely(skb_dst(skb)->tclassid)) { struct ip_rt_acct *st = per_cpu_ptr(ip_rt_acct, smp_processor_id()); u32 idx = skb_dst(skb)->tclassid; st[idx&0xFF].o_packets++; st[idx&0xFF].o_bytes += skb->len; st[(idx>>16)&0xFF].i_packets++; st[(idx>>16)&0xFF].i_bytes += skb->len; } #endif if (iph->ihl > 5 && ip_rcv_options(skb))goto drop; rt = skb_rtable(skb); if (rt->rt_type == RTN_MULTICAST) { IP_UPD_PO_STATS_BH(dev_net(rt->u.dst.dev), IPSTATS_MIB_INMCAST, skb->len); } else if (rt->rt_type == RTN_BROADCAST) IP_UPD_PO_STATS_BH(dev_net(rt->u.dst.dev), IPSTATS_MIB_INBCAST, skb->len); return dst_input(skb); drop: kfree_skb(skb); return NET_RX_DROP; }

20 2010/09/17 © by ip_local_deliver(skb) /net/ipv4/ip_input.c /* Deliver IP Packets to the higher protocol layers. */ int ip_local_deliver(struct sk_buff *skb) { /* Reassemble IP fragments. */ if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { if (ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER)) return 0; } return NF_HOOK(PF_INET, NF_INET_LOCAL_IN, skb, skb->dev, NULL, ip_local_deliver_finish); }

21 2010/09/17 © by ip_local_deliver_finish(skb) /net/ipv4/ip_input.c static int ip_local_deliver_finish(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); __skb_pull(skb, ip_hdrlen(skb)); /* Point into the IP datagram, just past the header. */ skb_reset_transport_header(skb); rcu_read_lock(); { int protocol = ip_hdr(skb)->protocol; int hash, raw; const struct net_protocol *ipprot; resubmit: raw = raw_local_deliver(skb, protocol); hash = protocol & (MAX_INET_PROTOS - 1); ipprot = rcu_dereference(inet_protos[hash]);

22 2010/09/17 © by ip_local_deliver_finish(skb) (cont’ed) if (ipprot != NULL) { int ret; if (!net_eq(net, &init_net) && !ipprot->netns_ok) { if (net_ratelimit()) printk("%s: proto %d isn't netns-ready\n", __func__, protocol); kfree_skb(skb); goto out; } if (!ipprot->no_policy) { if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { kfree_skb(skb); goto out; } nf_reset(skb); } ret = ipprot->handler(skb); if (ret < 0) { protocol = -ret; goto resubmit; } IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS); } else {

23 2010/09/17 © by ip_local_deliver_finish(skb) (cont’ed) if (!raw) { if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { IP_INC_STATS_BH(net, IPSTATS_MIB_INUNKNOWNPROTOS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0); } } else IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS); kfree_skb(skb); } out: rcu_read_unlock(); return 0; }

24 2010/09/17 © by dst_input(skb) static inline int dst_input(struct sk_buff *skb) { return skb_dst(skb)->input(skb); } static inline struct dst_entry *skb_dst(const struct sk_buff *skb){ return (struct dst_entry *)skb->_skb_dst; } static inline void skb_dst_set(struct sk_buff *skb, struct dst_entry *dst){ skb->_skb_dst = (unsigned long)dst; }

25 2010/09/17 © by dst_output(skb) /* Output packet to network from transport. */ static inline int dst_output(struct sk_buff *skb){ return skb_dst(skb)->output(skb); }

26 2010/09/17 © by struct dst_entry struct dst_entry { struct rcu_head rcu_head; struct dst_entry *child; struct net_device *dev; short error, obsolete; int flags; unsigned long expires; unsigned short header_len, trailer_len; /* space to reserve at tail */ unsigned int rate_tokens; unsigned long rate_last; /* rate limiting for ICMP */ struct dst_entry *path; struct neighbour *neighbour; struct hh_cache *hh; #ifdef CONFIG_XFRM struct xfrm_state *xfrm; #else

27 2010/09/17 © by struct dst_entry (cont’ed) void *__pad1; #endif int (*input)(struct sk_buff *); int (*output)(struct sk_buff *); struct dst_ops *ops; u32 metrics[RTAX_MAX]; #ifdef CONFIG_NET_CLS_ROUTE __u32 tclassid; #else __u32 __pad2; #endif /* Align __refcnt to a 64 bytes alignment */ #ifdef CONFIG_64BIT long __pad_to_align_refcnt[1]; #endif

28 2010/09/17 © by struct dst_entry (cont’ed) /* * __refcnt wants to be on a different cache line from * input/output/ops or performance tanks badly */ atomic_t __refcnt; /* client references */ int __use; unsigned long lastuse; union { struct dst_entry *next; struct rtable *rt_next; struct rt6_info *rt6_next; struct dn_route *dn_next; };

29 2010/09/17 © by ip_forward(skb) The primary task of ip_forward(skb) is to process a few conditions of the Internet Protocol (e.g., a packet's lifetime) and packet options. First, packets not marked with pkt_type == PACKET_HOST are deleted. Next, the packet's remaining lifetime is checked: if the value in its TTL field is 1 (before it is decremented), the packet is deleted. RFC 791 specifies that, in this case, an ICMP message (ICMP_TIME_EXCEEDED) has to be returned to inform the sender. Once a redirect message has been checked, if applicable, the socket buffer is checked for sufficient headroom: skb_cow(skb, headroom) verifies that there is still enough space for the MAC header of the output network device (out_dev->hard_header_len), and, if not, skb_realloc_headroom() creates sufficient space. Subsequently, the TTL field of the IP packet is decremented by one. When the actual packet length (including the MAC header) is known, it is checked against the frame format of the output network device. If the packet is too long (skb->len > mtu), and no fragmenting is allowed because the Don't-Fragment bit is set in the IP header, then the packet is discarded and the ICMP message ICMP_FRAG_NEEDED is transmitted to the sender. In any case, the packet is not fragmented yet; fragmenting is delayed. The early test for such cases prevents potential Don't-Fragment candidates from running through the entire IP protocol-handling process, only to be dropped eventually.

30 2010/09/17 © by ip_forward(skb) /net/ipv4/ip_forward.c int ip_forward(struct sk_buff *skb) { struct iphdr *iph;/* Our header */ struct rtable *rt;/* Route we use */ struct ip_options * opt= &(IPCB(skb)->opt); if (skb_warn_if_lro(skb))goto drop; if (!xfrm4_policy_check(NULL, XFRM_POLICY_FWD, skb)) goto drop; if (IPCB(skb)->opt.router_alert && ip_call_ra_chain(skb)) return NET_RX_SUCCESS; if (skb->pkt_type != PACKET_HOST) goto drop; skb_forward_csum(skb); /* According to the RFC, we must first decrease the TTL field. If that reaches zero, we must reply an ICMP control message telling that the packet's lifetime expired. */ if (ip_hdr(skb)->ttl <= 1)goto too_many_hops; if (!xfrm4_route_forward(skb))goto drop;

31 2010/09/17 © by ip_forward(skb) (cont’ed) rt = skb_rtable(skb); if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway) goto sr_failed; if (unlikely(skb->len > dst_mtu(&rt->u.dst) && !skb_is_gso(skb) && (ip_hdr(skb)->frag_off & htons(IP_DF))) && !skb->local_df) { IP_INC_STATS(dev_net(rt->u.dst.dev), IPSTATS_MIB_FRAGFAILS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(dst_mtu(&rt->u.dst))); goto drop; } /* We are about to mangle packet. Copy it! */ if (skb_cow(skb, LL_RESERVED_SPACE(rt->u.dst.dev)+rt->u.dst.header_len)) goto drop; iph = ip_hdr(skb); /* Decrease ttl after skb cow done */ ip_decrease_ttl(iph); /* now generate an ICMP HOST REDIRECT giving the route calculated. */ if (rt->rt_flags&RTCF_DOREDIRECT && !opt->srr && !skb_sec_path(skb)) ip_rt_send_redirect(skb);

32 2010/09/17 © by ip_forward(skb) (cont’ed) skb->priority = rt_tos2priority(iph->tos); return NF_HOOK(PF_INET, NF_INET_FORWARD, skb, skb->dev, rt->u.dst.dev, ip_forward_finish); sr_failed: /* Strict routing permits no gatewaying */ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_SR_FAILED, 0); goto drop; too_many_hops: /* Tell the sender its packet died... */ IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_INHDRERRORS); icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0); drop: kfree_skb(skb); return NET_RX_DROP; }

33 2010/09/17 © by ip_forward_finish(skb) /net/ipv4/ip_forward.c static int ip_forward_finish(struct sk_buff *skb) { struct ip_options * opt = &(IPCB(skb)->opt); IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS); if (unlikely(opt->optlen)) ip_forward_options(skb); return dst_output(skb); } ip_forward_finish() has very little functionality: once the IP options, if present, have been processed in ip_forward_options(), the packet is handed to dst_output(). Any fragmentation that is still needed happens later on the output path, in ip_fragment() called from ip_finish_output().

34 2010/09/17 © by ip_forward_options(skb) /net/ipv4/ip_options.c void ip_forward_options(struct sk_buff *skb) { struct ip_options * opt = &(IPCB(skb)->opt); unsigned char * optptr; struct rtable *rt = skb_rtable(skb); unsigned char *raw = skb_network_header(skb); if (opt->rr_needaddr) { optptr = (unsigned char *)raw + opt->rr; ip_rt_get_source(&optptr[optptr[2]-5], rt); opt->is_changed = 1; } if (opt->srr_is_hit) { int srrptr, srrspace; optptr = raw + opt->srr; for ( srrptr=optptr[2], srrspace = optptr[1]; srrptr <= srrspace; srrptr += 4 ) { if (srrptr + 3 > srrspace) break; if (memcmp(&rt->rt_dst, &optptr[srrptr-1], 4) == 0) break; }

35 2010/09/17 © by ip_forward_options(skb) (cont’ed) if (srrptr + 3 <= srrspace) { opt->is_changed = 1; ip_rt_get_source(&optptr[srrptr-1], rt); ip_hdr(skb)->daddr = rt->rt_dst; optptr[2] = srrptr+4; } else if (net_ratelimit()) printk(KERN_CRIT "ip_forward(): Argh! Destination lost!\n"); if (opt->ts_needaddr) { optptr = raw + opt->ts; ip_rt_get_source(&optptr[optptr[2]-9], rt); opt->is_changed = 1; } } if (opt->is_changed) { opt->is_changed = 0; ip_send_check(ip_hdr(skb)); } }

36 2010/09/17 © by ip_send_check(iph) /net/ipv4/ip_output.c /* Generate a checksum for an outgoing IP datagram. */ __inline__ void ip_send_check(struct iphdr *iph){ iph->check = 0; iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); }

37 2010/09/17 © by ip_queue_xmit(skb, ipfragok) /net/ipv4/ip_output.c int ip_queue_xmit(struct sk_buff *skb, int ipfragok) { struct sock *sk = skb->sk; struct inet_sock *inet = inet_sk(sk); struct ip_options *opt = inet->opt; struct rtable *rt; struct iphdr *iph; rt = skb_rtable(skb); if (rt != NULL)goto packet_routed; /* Make sure we can route this packet. */ rt = (struct rtable *)__sk_dst_check(sk, 0); if (rt == NULL) { __be32 daddr; /* Use correct destination address if we have options. */ daddr = inet->inet_daddr; if(opt && opt->srr) daddr = opt->faddr;

38 2010/09/17 © by ip_queue_xmit(skb, ipfragok) (cont’ed) { struct flowi fl = { .oif = sk->sk_bound_dev_if, .mark = sk->sk_mark, .nl_u = { .ip4_u = { .daddr = daddr, .saddr = inet->inet_saddr, .tos = RT_CONN_FLAGS(sk) } }, .proto = sk->sk_protocol, .flags = inet_sk_flowi_flags(sk), .uli_u = { .ports = { .sport = inet->inet_sport, .dport = inet->inet_dport } } }; security_sk_classify_flow(sk, &fl); if (ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0)) goto no_route; }

39 2010/09/17 © by ip_queue_xmit(skb, ipfragok) (cont’ed) sk_setup_caps(sk, &rt->u.dst); } skb_dst_set(skb, dst_clone(&rt->u.dst)); packet_routed: if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) goto no_route; skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0)); skb_reset_network_header(skb); iph = ip_hdr(skb); *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff)); if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok) iph->frag_off = htons(IP_DF); else iph->frag_off = 0; iph->ttl = ip_select_ttl(inet, &rt->u.dst); iph->protocol = sk->sk_protocol; iph->saddr = rt->rt_src; iph->daddr = rt->rt_dst;

40 2010/09/17 © by ip_queue_xmit(skb, ipfragok) (cont’ed) if (opt && opt->optlen) { iph->ihl += opt->optlen >> 2; ip_options_build(skb, opt, inet->inet_daddr, rt, 0); } ip_select_ident_more(iph, &rt->u.dst, sk, (skb_shinfo(skb)->gso_segs ?: 1) - 1); skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; return ip_local_out(skb); no_route: IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); kfree_skb(skb); return -EHOSTUNREACH; }

41 2010/09/17 © by ip_local_out(skb) /net/ipv4/ip_output.c int ip_local_out(struct sk_buff *skb) { int err; err = __ip_local_out(skb); if (likely(err == 1))err = dst_output(skb); return err; } EXPORT_SYMBOL_GPL(ip_local_out);

42 2010/09/17 © by __ip_local_out(skb) /net/ipv4/ip_output.c int __ip_local_out(struct sk_buff *skb) { struct iphdr *iph = ip_hdr(skb); iph->tot_len = htons(skb->len); ip_send_check(iph); return nf_hook(PF_INET, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev, dst_output); }

43 2010/09/17 © by ip_output(skb) /net/ipv4/ip_output.c int ip_output(struct sk_buff *skb) { struct net_device *dev = skb_dst(skb)->dev; IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len); skb->dev = dev; skb->protocol = htons(ETH_P_IP); return NF_HOOK_COND(PF_INET, NF_INET_POST_ROUTING, skb, NULL, dev, ip_finish_output,!(IPCB(skb)->flags & IPSKB_REROUTED)); }

44 2010/09/17 © by ip_finish_output(skb) /net/ipv4/ip_output.c static int ip_finish_output(struct sk_buff *skb) { #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) /* Policy lookup after SNAT yielded a new policy */ if (skb_dst(skb)->xfrm != NULL) { IPCB(skb)->flags |= IPSKB_REROUTED; return dst_output(skb); } #endif if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb)) return ip_fragment(skb, ip_finish_output2); else return ip_finish_output2(skb); }

45 2010/09/17 © by ip_finish_output2(skb) /net/ipv4/ip_output.c static inline int ip_finish_output2(struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct rtable *rt = (struct rtable *)dst; struct net_device *dev = dst->dev; unsigned int hh_len = LL_RESERVED_SPACE(dev); if (rt->rt_type == RTN_MULTICAST) { IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len); } else if (rt->rt_type == RTN_BROADCAST) IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len); /* Be paranoid, rather than too clever. */ if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { struct sk_buff *skb2; skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));

46 2010/09/17 © by ip_finish_output2(skb) (cont’ed) if (skb2 == NULL) { kfree_skb(skb); return -ENOMEM; } if (skb->sk) skb_set_owner_w(skb2, skb->sk); kfree_skb(skb); skb = skb2; } if (dst->hh)return neigh_hh_output(dst->hh, skb); else if (dst->neighbour)return dst->neighbour->output(skb); if (net_ratelimit()) printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n"); kfree_skb(skb); return -EINVAL; }

47 2010/09/17 © by Netfilter hooks for connection tracking

48 2010/09/17 © by NF_HOOK() static inline int NF_HOOK(uint8_t pf, unsigned int hook, struct sk_buff *skb, struct net_device *in, struct net_device *out, int (*okfn)(struct sk_buff *)) { return NF_HOOK_THRESH(pf, hook, skb, in, out, okfn, INT_MIN); } static inline int NF_HOOK_THRESH(uint8_t pf, unsigned int hook, struct sk_buff *skb, struct net_device *in, struct net_device *out, int (*okfn)(struct sk_buff *), int thresh) { int ret = nf_hook_thresh(pf, hook, skb, in, out, okfn, thresh); if (ret == 1) ret = okfn(skb); return ret; }

49 2010/09/17 © by Arguments of NF_HOOK macro pf (protocol family): This is the identifier of the protocol family: PF_INET for IP Version 4, PF_INET6 for IP Version 6. hook: This is the hook identifier. All valid identifiers for each protocol family are defined in a header file (e.g., linux/netfilter_ipv4.h). skb: This is a pointer to the sk_buff structure with the packet to be handled. indev (input device): This is a pointer to the net_device structure of the network device that received the packet. It is set to NULL in the above example, because the packet is an outgoing packet. outdev (output device): This is a pointer to the net_device structure of the network device that should be used by the packet to leave the local computer. In the above example, the device used has to be determined first by use of the routing table (rt). okfn() (okay function): This function is invoked when all filter functions registered with this hook returned NF_ACCEPT, thereby okaying the packet's transit.
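
As a concrete reading of these arguments, the NF_HOOK() invocation in ip_local_deliver() (slide 20) maps onto them as follows (annotations added here for illustration):
return NF_HOOK(PF_INET,                  /* pf:     IPv4 */
               NF_INET_LOCAL_IN,         /* hook:   packets addressed to this host */
               skb,                      /* skb:    the packet being handled */
               skb->dev,                 /* indev:  device the packet arrived on */
               NULL,                     /* outdev: none, the packet is not leaving */
               ip_local_deliver_finish); /* okfn:   runs if all hooks return NF_ACCEPT */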

50 2010/09/17 © by nf_hook() static inline int nf_hook(u_int8_t pf, unsigned int hook, struct sk_buff *skb, struct net_device *indev, struct net_device *outdev, int (*okfn)(struct sk_buff *)) { return nf_hook_thresh(pf, hook, skb, indev, outdev, okfn, INT_MIN); } static inline int nf_hook_thresh(u_int8_t pf, unsigned int hook, struct sk_buff *skb, struct net_device *indev, struct net_device *outdev, int (*okfn)(struct sk_buff *), int thresh) { #ifndef CONFIG_NETFILTER_DEBUG if (list_empty(&nf_hooks[pf][hook])) return 1; #endif return nf_hook_slow(pf, hook, skb, indev, outdev, okfn, thresh); }

51 2010/09/17 © by nf_hook_thresh() /** nf_hook_thresh - call a netfilter hook Returns 1 if the hook has allowed the packet to pass. The function okfn must be invoked by the caller in this case. Any other return value indicates the packet has been consumed by the hook. */ static inline int nf_hook_thresh(u_int8_t pf, unsigned int hook, struct sk_buff *skb, struct net_device *indev, struct net_device *outdev, int (*okfn)(struct sk_buff *), int thresh) { #ifndef CONFIG_NETFILTER_DEBUG if (list_empty(&nf_hooks[pf][hook]))return 1; #endif return nf_hook_slow(pf, hook, skb, indev, outdev, okfn, thresh); }

52 2010/09/17 © by nf_hook_slow() /net/netfilter/core.c /* Returns 1 if okfn() needs to be executed by the caller, * -EPERM for NF_DROP, 0 otherwise. */ int nf_hook_slow(u_int8_t pf, unsigned int hook, struct sk_buff *skb, struct net_device *indev, struct net_device *outdev, int (*okfn)(struct sk_buff *), int hook_thresh) { struct list_head *elem; unsigned int verdict; int ret = 0; rcu_read_lock(); elem = &nf_hooks[pf][hook]; next_hook: verdict = nf_iterate(&nf_hooks[pf][hook], skb, hook, indev, outdev, &elem, okfn, hook_thresh);

53 2010/09/17 © by nf_hook_slow() (cont’ed) if (verdict == NF_ACCEPT || verdict == NF_STOP) { ret = 1; } else if (verdict == NF_DROP) { kfree_skb(skb); ret = -EPERM; } else if ((verdict & NF_VERDICT_MASK) == NF_QUEUE) { if (!nf_queue(skb, elem, pf, hook, indev, outdev, okfn, verdict >> NF_VERDICT_BITS)) goto next_hook; } rcu_read_unlock(); return ret; } EXPORT_SYMBOL(nf_hook_slow);

55 2010/09/17 © by nf_iterate() /net/netfilter/core.c unsigned int nf_iterate(struct list_head *head, struct sk_buff *skb, unsigned int hook, const struct net_device *indev, const struct net_device *outdev, struct list_head **i, int (*okfn)(struct sk_buff *), int hook_thresh) { unsigned int verdict; /* The caller must not block between calls to this function because of risk of continuing from deleted element. */ list_for_each_continue_rcu(*i, head) { struct nf_hook_ops *elem = (struct nf_hook_ops *)*i; if (hook_thresh > elem->priority) continue; /* Optimization: we don't need to hold module reference here, since function can't sleep. --RR */ verdict = elem->hook(hook, skb, indev, outdev, okfn); if (verdict != NF_ACCEPT) {

56 2010/09/17 © by nf_iterate() (cont’ed) #ifdef CONFIG_NETFILTER_DEBUG if (unlikely((verdict & NF_VERDICT_MASK) > NF_MAX_VERDICT)) { NFDEBUG("Evil return from %p(%u).\n", elem->hook, hook); continue; } #endif if (verdict != NF_REPEAT) return verdict; *i = (*i)->prev; } } return NF_ACCEPT; }

57 2010/09/17 © by Netfilter hook identifier NF_IP_PRE_ROUTING (0): Incoming packets pass this hook in the ip_rcv() function before they are processed by the routing code. Prior to that, only a few simple consistency checks with regard to the version, length, and checksum fields in the IP header are done. Meaningful opportunities to use this hook result whenever incoming packets should be caught before they are processed—for example, to detect certain types of denial-of-service attacks that operate on poorly built IP packets, or for address-translation mechanisms (NAT), or for accounting functions (counting of incoming packets). NF_IP_LOCAL_IN (1): All incoming packets addressed to the local computer pass this hook in the function ip_local_deliver(). At this point, the iptables module hooks the INPUT rules list into place to filter incoming data packets. This corresponds to the input rules list in ipchains.

58 2010/09/17 © by Netfilter hook identifier (cont’ed) NF_IP_FORWARD (2): All incoming packets not addressed to the local computer pass this hook in the function ip_forward()—that is, packets to be forwarded and leaving the computer over a different network interface. This includes any packet the address of which was modified by NAT. At this point, the iptables module hooks the FORWARD rules list into place to filter forwarded data packets. This corresponds to the forward rules list in ipchains. NF_IP_LOCAL_OUT (3): All outgoing packets created in the local computer pass this hook in the function ip_build_and_send_pkt(). At this point, the iptables module hooks the OUTPUT rules list into place to filter outgoing data packets. This corresponds to the output rules list in ipchains. NF_IP_POST_ROUTING (4): This hook in the ip_finish_output() function represents the last chance to access all outgoing (forwarded or locally created) packets before they leave the computer over a network device. Like the NF_IP_PRE_ROUTING hook, this is a good place to integrate accounting functions.

59 2010/09/17 © by nf_hookfn The packet-filter functions that are actually hooked into the netfilter hooks are so-called hook functions of the type nf_hookfn. The parameters (except for the protocol-family identifier) correspond exactly to those of the NF_HOOK macro: typedef unsigned int nf_hookfn(unsigned int hooknum, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *));

60 2010/09/17 © by Return value of a packet-filter function The return value of a packet-filter function specifies what should happen to the packet. The verdicts are defined in linux/netfilter.h. NF_DROP (0): The active rules list processing is stopped, and the packet is dropped. NF_ACCEPT (1): The packet is passed to the next packet-filter function in the rules list. Once the end of the list has been reached, the packet is released by okfn() for further processing. NF_STOLEN (2): The packet-filter function withholds the packet for further processing, so that the active rules list processing is stopped. In contrast to NF_DROP, however, the packet does not have to be explicitly dropped. NF_QUEUE (3): The function nf_queue() (net/netfilter/nf_queue.c) puts the packet in a queue from which it can be removed and processed (e.g., by a user-space program). Subsequently, nf_reinject() has to be invoked to return the packet to the Linux kernel for further processing by netfilter. NF_REPEAT (4): In contrast to NF_ACCEPT, rather than a continuation of processing at the next packet-filter function, the current filter function is invoked again.

61 2010/09/17 © by nf_register_hook(), nf_unregister_hook() nf_register_hook() and nf_unregister_hook() register and unregister a packet-filter function with the Linux kernel. The parameter passed is an nf_hook_ops structure, which includes all required information. struct nf_hook_ops { struct list_head list; nf_hookfn *hook; struct module *owner; u_int8_t pf; unsigned int hooknum; /* Hooks are ordered in ascending priority. */ int priority; };

62 2010/09/17 © by struct nf_hook_ops list: The nf_hook_ops structures are maintained in a linked list within the Linux kernel. hook(): This is a pointer to the actual packet-filter function of the type nf_hookfn. pf, hooknum: The protocol family identifier (e.g., PF_INET or PF_INET6) and the hook identifier (e.g., NF_IP_LOCAL_IN) are used to determine the hook for this packet-filter function. priority: Packet-filter functions within the rules list of a hook are sorted by the priority field in ascending order, so that they will be invoked in this order when a packet transits. Priority values are defined, e.g., in linux/netfilter_ipv4.h: enum nf_ip_hook_priorities { NF_IP_PRI_FIRST = INT_MIN, NF_IP_PRI_CONNTRACK = -200, NF_IP_PRI_MANGLE = -150, NF_IP_PRI_NAT_DST = -100, NF_IP_PRI_FILTER = 0, NF_IP_PRI_NAT_SRC = 100, NF_IP_PRI_LAST = INT_MAX, };

63 2010/09/17 © by First netfilter example module /* Sample code to install a Netfilter hook function that will * drop all incoming packets. */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> static struct nf_hook_ops nfho; /* Hook function; this deck uses the pre-2.6.24 signature, in which the skb is passed as struct sk_buff ** */ unsigned int my_hookfunc(unsigned int hooknum, struct sk_buff **skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { return NF_DROP; /* Drop ALL packets */ }

64 2010/09/17 © by First netfilter example module (cont’ed) /* Note: functions passed to module_init()/module_exit() must not be named init_module()/cleanup_module(), so they are renamed here. */ static int __init my_hook_init(void) { /* Fill in our hook structure */ nfho.hook = my_hookfunc; /* Handler function */ nfho.hooknum = NF_IP_PRE_ROUTING; /* First hook for IPv4 */ nfho.pf = PF_INET; nfho.priority = NF_IP_PRI_FIRST; /* Make our function first */ nf_register_hook(&nfho); return 0; } static void __exit my_hook_exit(void) { nf_unregister_hook(&nfho); } module_init(my_hook_init); module_exit(my_hook_exit);

65 2010/09/17 © by Second netfilter example module //For any packet, get the IP header and check the protocol field: //if the protocol number equals UDP (17), log to /var/log/messages; //the default action of the module is to let all packets through #include <linux/module.h> #include <linux/kernel.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <linux/ip.h> #include <linux/udp.h> static struct nf_hook_ops nfho; //netfilter hook option struct struct sk_buff *sock_buff; struct udphdr *udp_header; struct iphdr *ip_header; //ip header struct

66 2010/09/17 © by Second netfilter example module (cont’ed) unsigned int my_hookfunc(unsigned int hooknum, struct sk_buff **skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { sock_buff = *skb; if (!sock_buff) { return NF_ACCEPT; } /* check for NULL before touching the headers */ ip_header = (struct iphdr *)skb_network_header(sock_buff); if (ip_header->protocol == 17) { /* 17 = IPPROTO_UDP */ udp_header = (struct udphdr *)skb_transport_header(sock_buff); printk(KERN_INFO "got udp packet\n"); //log to /var/log/messages return NF_DROP; } return NF_ACCEPT; }

67 2010/09/17 © by Second netfilter example module (cont’ed) static int __init my_hook_init(void) { nfho.hook = my_hookfunc; nfho.hooknum = NF_IP_PRE_ROUTING; nfho.pf = PF_INET; nfho.priority = NF_IP_PRI_FIRST; nf_register_hook(&nfho); return 0; } static void __exit my_hook_exit(void) { nf_unregister_hook(&nfho); } module_init(my_hook_init); module_exit(my_hook_exit);

68 2010/09/17 © by Third netfilter example module /* Sample code to install a Netfilter hook function that will drop all incoming packets from an IP address we specify */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <linux/ip.h> /* The structure used to register the filter function */ static struct nf_hook_ops nfho; /* IP address we want to drop packets from, in network byte order (127.0.0.1) */ static unsigned char *drop_ip = "\x7f\x00\x00\x01";

69 2010/09/17 © by Third netfilter example module (cont’ed) /* This is the hook function itself */ unsigned int my_hookfunc(unsigned int hooknum, struct sk_buff **skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { struct sk_buff *sb = *skb; if (ip_hdr(sb)->saddr == *(unsigned int *)drop_ip) { printk("Dropped packet from... %d.%d.%d.%d\n", *drop_ip, *(drop_ip + 1), *(drop_ip + 2), *(drop_ip + 3)); return NF_DROP; } else { return NF_ACCEPT; } }

70 2010/09/17 © by Third netfilter example module (cont’ed) static int __init my_hook_init(void) { nfho.hook = my_hookfunc; nfho.hooknum = NF_IP_PRE_ROUTING; /* First hook for IPv4 */ nfho.pf = PF_INET; nfho.priority = NF_IP_PRI_FIRST; /* Make our function first */ nf_register_hook(&nfho); return 0; } /* Cleanup routine */ static void __exit my_hook_exit(void) { nf_unregister_hook(&nfho); } module_init(my_hook_init); module_exit(my_hook_exit);

71 2010/09/17 © by Connection tracking The module interface of the connection-tracking module is located in the file net/ipv4/netfilter/ip_conntrack_standalone.c. The file net/ipv4/netfilter/ip_conntrack_core.c contains the actual connection-tracking functionality. The connection-tracking module hooks itself into the netfilter hooks NF_IP_PRE_ROUTING and NF_IP_LOCAL_OUT with very high priority (NF_IP_PRI_CONNTRACK is set to -200 in linux/netfilter_ipv4.h). enum nf_ip_hook_priorities { NF_IP_PRI_FIRST = INT_MIN, NF_IP_PRI_CONNTRACK_DEFRAG = -400, NF_IP_PRI_RAW = -300, NF_IP_PRI_SELINUX_FIRST = -225, NF_IP_PRI_CONNTRACK = -200, NF_IP_PRI_MANGLE = -150, NF_IP_PRI_NAT_DST = -100, NF_IP_PRI_FILTER = 0, NF_IP_PRI_SECURITY = 50, NF_IP_PRI_NAT_SRC = 100, NF_IP_PRI_SELINUX_LAST = 225, NF_IP_PRI_CONNTRACK_CONFIRM = INT_MAX, NF_IP_PRI_LAST = INT_MAX, };

72 2010/09/17 © by ip_route_input() /net/ipv4/route.c int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, u8 tos, struct net_device *dev) { struct rtable *rth; unsigned hash; int iif = dev->ifindex; struct net *net; net = dev_net(dev); if (!rt_caching(net)) goto skip_cache; tos &= IPTOS_RT_MASK; hash = rt_hash(daddr, saddr, iif, rt_genid(net)); rcu_read_lock();

73 2010/09/17 © by ip_route_input() (cont’ed) for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; rth = rcu_dereference(rth->u.dst.rt_next)) { if (((rth->fl.fl4_dst ^ daddr) | (rth->fl.fl4_src ^ saddr) | (rth->fl.iif ^ iif) | rth->fl.oif | (rth->fl.fl4_tos ^ tos)) == 0 && rth->fl.mark == skb->mark && net_eq(dev_net(rth->u.dst.dev), net) && !rt_is_expired(rth)) { dst_use(&rth->u.dst, jiffies); RT_CACHE_STAT_INC(in_hit); rcu_read_unlock(); skb_dst_set(skb, &rth->u.dst); return 0; } RT_CACHE_STAT_INC(in_hlist_search); } rcu_read_unlock();

74 2010/09/17 © by ip_route_input() (cont’ed) skip_cache: /* Multicast recognition logic is moved from route cache to here. The problem was that too many Ethernet cards have broken/missing hardware multicast filters :-( As result the host on multicasting network acquires a lot of useless route cache entries, sort of SDR messages from all the world. Now we try to get rid of them. Really, provided software IP multicast filter is organized reasonably (at least, hashed), it does not result in a slowdown comparing with route cache reject entries. Note, that multicast routers are not affected, because route cache entry is created eventually. */ if (ipv4_is_multicast(daddr)) { struct in_device *in_dev; rcu_read_lock();

75 2010/09/17 © by ip_route_input() (cont’ed) if ((in_dev = __in_dev_get_rcu(dev)) != NULL) { int our = ip_check_mc(in_dev, daddr, saddr, ip_hdr(skb)->protocol); if (our #ifdef CONFIG_IP_MROUTE || (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev)) #endif ) { rcu_read_unlock(); return ip_route_input_mc(skb, daddr, saddr, tos, dev, our); } rcu_read_unlock(); return -EINVAL; } return ip_route_input_slow(skb, daddr, saddr, tos, dev); }

76 2010/09/17 © by ip_route_input_slow() /net/ipv4/route.c static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, u8 tos, struct net_device *dev) { struct fib_result res; struct in_device *in_dev = in_dev_get(dev); struct flowi fl = { .nl_u = { .ip4_u = { .daddr = daddr, .saddr = saddr, .tos = tos, .scope = RT_SCOPE_UNIVERSE, } }, .mark = skb->mark, .iif = dev->ifindex }; unsigned flags = 0; u32 itag = 0; struct rtable *rth; unsigned hash; __be32 spec_dst; int err = -EINVAL, free_res = 0;

77 2010/09/17 © by ip_route_input_slow() (cont’ed) struct net * net = dev_net(dev); /* IP on this device is disabled. */ if (!in_dev)goto out; /*Check for the most weird martians, which can be not detected by fib_lookup. */ if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || ipv4_is_loopback(saddr))goto martian_source; if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0)) goto brd_input; /* Accept zero addresses only to limited broadcast; I even do not know to fix it or not. Waiting for complains :-) */ if (ipv4_is_zeronet(saddr))goto martian_source; if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr)) goto martian_destination; /* Now we are ready to route packet. */ if ((err = fib_lookup(net, &fl, &res)) != 0) { if (!IN_DEV_FORWARD(in_dev)) goto e_hostunreach; goto no_route; }

78 2010/09/17 © by ip_route_input_slow() (cont’ed) free_res = 1; RT_CACHE_STAT_INC(in_slow_tot); if (res.type == RTN_BROADCAST)goto brd_input; if (res.type == RTN_LOCAL) { int result; result = fib_validate_source(saddr, daddr, tos, net->loopback_dev->ifindex, dev, &spec_dst, &itag, skb->mark); if (result < 0)goto martian_source; if (result)flags |= RTCF_DIRECTSRC; spec_dst = daddr; goto local_input; } if (!IN_DEV_FORWARD(in_dev))goto e_hostunreach; if (res.type != RTN_UNICAST)goto martian_destination; err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);

79 2010/09/17 © by ip_route_input_slow() (cont’ed) done: in_dev_put(in_dev); if (free_res)fib_res_put(&res); out:return err; brd_input: if (skb->protocol != htons(ETH_P_IP))goto e_inval; if (ipv4_is_zeronet(saddr)) spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); else { err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, &itag, skb->mark); if (err < 0)goto martian_source; if (err)flags |= RTCF_DIRECTSRC; } flags |= RTCF_BROADCAST; res.type = RTN_BROADCAST; RT_CACHE_STAT_INC(in_brd);

80 2010/09/17 © by ip_route_input_slow() (cont’ed) local_input: rth = dst_alloc(&ipv4_dst_ops); if (!rth)goto e_nobufs; rth->u.dst.output= ip_rt_bug; rth->u.dst.obsolete = -1; rth->rt_genid = rt_genid(net); atomic_set(&rth->u.dst.__refcnt, 1); rth->u.dst.flags= DST_HOST; if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) rth->u.dst.flags |= DST_NOPOLICY; rth->fl.fl4_dst= daddr; rth->rt_dst= daddr; rth->fl.fl4_tos= tos; rth->fl.mark = skb->mark; rth->fl.fl4_src= saddr; rth->rt_src= saddr; #ifdef CONFIG_NET_CLS_ROUTE rth->u.dst.tclassid = itag; #endif

81 2010/09/17 © by ip_route_input_slow() (cont’ed) rth->rt_iif= rth->fl.iif= dev->ifindex; rth->u.dst.dev= net->loopback_dev; dev_hold(rth->u.dst.dev); rth->idev= in_dev_get(rth->u.dst.dev); rth->rt_gateway= daddr; rth->rt_spec_dst= spec_dst; rth->u.dst.input= ip_local_deliver; rth->rt_flags = flags|RTCF_LOCAL; if (res.type == RTN_UNREACHABLE) { rth->u.dst.input= ip_error; rth->u.dst.error= -err; rth->rt_flags &= ~RTCF_LOCAL; } rth->rt_type= res.type; hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net)); err = rt_intern_hash(hash, rth, NULL, skb, fl.iif); goto done;

82 2010/09/17 © by ip_route_input_slow() (cont’ed) no_route: RT_CACHE_STAT_INC(in_no_route); spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); res.type = RTN_UNREACHABLE; if (err == -ESRCH) err = -ENETUNREACH; goto local_input; /* Do not cache martian addresses: they should be logged (RFC1812) */ martian_destination: RT_CACHE_STAT_INC(in_martian_dst); #ifdef CONFIG_IP_ROUTE_VERBOSE if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n", &daddr, &saddr, dev->name); #endif e_hostunreach: err = -EHOSTUNREACH; goto done;

83 2010/09/17 © by ip_route_input_slow() (cont’ed) e_inval: err = -EINVAL; goto done; e_nobufs: err = -ENOBUFS; goto done; martian_source: ip_handle_martian_source(dev, in_dev, skb, daddr, saddr); goto e_inval; }

84 2010/09/17 © by ip_handle_martian_source() static void ip_handle_martian_source(struct net_device *dev, struct in_device *in_dev, struct sk_buff *skb, __be32 daddr, __be32 saddr) { RT_CACHE_STAT_INC(in_martian_src); #ifdef CONFIG_IP_ROUTE_VERBOSE if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) { /* RFC1812: if the source is martian, the only hint is the MAC header */ printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n", &daddr, &saddr, dev->name); if (dev->hard_header_len && skb_mac_header_was_set(skb)) { int i; const unsigned char *p = skb_mac_header(skb); printk(KERN_WARNING "ll header: "); for (i = 0; i < dev->hard_header_len; i++, p++) { printk("%02x", *p); if (i < (dev->hard_header_len - 1)) printk(":"); } printk("\n"); } } #endif }

85 2010/09/17 © by PF_RING architecture

86 2010/09/17 © by PF_RING PF_RING is a socket-based packet capture interface. It consists of three software components: a kernel module, PF_RING, implemented as a new socket protocol type, which handles all socket buffers for both packet reception and transmission; a user-space library, libpfring, which gives user applications access to the underlying socket-based ring-buffer management scheme; and a set of example user applications that demonstrate how to use PF_RING. Kernel module PF_RING (pf_ring.h and pf_ring.c) User library libpfring.a or libpfring.o (pfring.h and pfring.c) Example user application programs such as pfcount.c
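
To show how a user application drives the ring, here is a minimal capture loop written against the classic libpfring API. The exact signatures of pfring_open(), pfring_recv() and pfring_enable_ring() vary between PF_RING releases, so treat this as an assumption-laden sketch (modeled loosely on pfcount.c) rather than canonical code:
/* Minimal libpfring capture sketch; assumes a 4.x-era API
 * (check the pfring.h of your release for the exact signatures). */
#include <stdio.h>
#include "pfring.h"

int main(void)
{
    char buf[2048];
    struct pfring_pkthdr hdr;
    /* device, promiscuous mode, snaplen, reentrant */
    pfring *ring = pfring_open("eth0", 1, sizeof(buf), 0);
    if (ring == NULL) {
        perror("pfring_open");
        return 1;
    }
    pfring_enable_ring(ring); /* required on more recent releases */
    for (;;) {
        /* blocking receive of one packet from the ring */
        if (pfring_recv(ring, buf, sizeof(buf), &hdr, 1) > 0)
            printf("captured %u bytes\n", hdr.caplen);
    }
    pfring_close(ring);
    return 0;
}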

87 2010/09/17 © by Some pf_ring.c global variables static struct proto ring_proto; static struct list_head ring_table; static u_int ring_table_size; static struct list_head ring_cluster_list; /* List of all devices on which PF_RING has been registered */ static struct list_head ring_aware_device_list; /* List of all dna (direct nic access) devices */ static struct list_head ring_dna_devices_list; static u_int dna_devices_list_size = 0; /* pf_ring.h: #define MAX_NUM_DEVICES 256 */ static struct list_head device_ring_list[MAX_NUM_DEVICES]; static struct net_proto_family ring_family_ops = { .family = PF_RING, .create = ring_create, .owner = THIS_MODULE, }; /* Dummy 'any' device */ static struct net_device any_dev, none_dev;

88 2010/09/17 © by struct proto /* Networking protocol blocks attached to sockets. socket layer -> transport layer interface transport -> network interface is defined by struct inet_proto */ struct proto { void(*close)(struct sock *sk, long timeout); int(*connect)(struct sock *sk,struct sockaddr *uaddr, int addr_len); int(*disconnect)(struct sock *sk, int flags); struct sock *(*accept) (struct sock *sk, int flags, int *err); int(*ioctl)(struct sock *sk, int cmd, unsigned long arg); int(*init)(struct sock *sk); void(*destroy)(struct sock *sk); void(*shutdown)(struct sock *sk, int how); int(*setsockopt)(struct sock *sk, int level, int optname, char __user *optval, unsigned int optlen); int(*getsockopt)(struct sock *sk, int level, int optname, char __user *optval, int __user *option);

89 2010/09/17 © by struct proto (cont’ed) #ifdef CONFIG_COMPAT int(*compat_setsockopt)(struct sock *sk, int level, int optname, char __user *optval, unsigned int optlen); int(*compat_getsockopt)(struct sock *sk, int level, int optname, char __user *optval, int __user *option); #endif int(*sendmsg)(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len); int(*recvmsg)(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len, int noblock, int flags, int *addr_len); int(*sendpage)(struct sock *sk, struct page *page, int offset, size_t size, int flags); int(*bind)(struct sock *sk, struct sockaddr *uaddr, int addr_len); int(*backlog_rcv) (struct sock *sk, struct sk_buff *skb); /* Keeping track of sk's, looking them up, and port selection methods. */ void(*hash)(struct sock *sk); void(*unhash)(struct sock *sk); int(*get_port)(struct sock *sk, unsigned short snum);

90 2010/09/17 © by struct proto (cont’ed) #ifdef CONFIG_PROC_FS unsigned int inuse_idx; #endif void (*enter_memory_pressure)(struct sock *sk); atomic_t *memory_allocated; /* Current allocated memory. */ struct percpu_counter *sockets_allocated; /* Current num of sockets. */ int *memory_pressure; int *sysctl_mem; int *sysctl_wmem; int *sysctl_rmem; int max_header; struct kmem_cache *slab; unsigned int obj_size; int slab_flags; struct percpu_counter *orphan_count; struct request_sock_ops *rsk_prot; struct timewait_sock_ops *twsk_prot;

91 2010/09/17 © by struct proto (cont’ed) union { struct inet_hashinfo *hashinfo; struct udp_table *udp_table; struct raw_hashinfo *raw_hash; } h; struct module *owner; char name[32]; struct list_head node; #ifdef SOCK_REFCNT_DEBUG atomic_t socks; #endif };

92 2010/09/17 © by struct ring_opt /* Ring options */ struct ring_opt { u_int8_t ring_active, num_rx_channels; struct net_device *ring_netdev; u_short ring_pid; u_int32_t ring_id; char *appl_name; /* String that id the application bound to the socket */ packet_direction direction; /* Specify the capture direction for packets */ struct ring_opt *master_ring; /* Master Ring */ u_int8_t mmap_count; dna_device *dna_device; /* Direct NIC Access */ u_short cluster_id; /* Cluster, 0 = no cluster */ int32_t channel_id; /* Channel, -1 = any channel */ struct net_device *reflector_dev; /* Reflector device */ unsigned long order; /* Packet buffers */ void *ring_memory; /* Ring Slots */

93 2010/09/17 © by struct ring_opt (cont’ed) u_int32_t bucket_len; FlowSlotInfo *slots_info; /* Points to ring_memory */ char *ring_slots; /* Points to ring_memory+sizeof(FlowSlotInfo) */ u_int32_t pktToSample, sample_rate; /* Packet Sampling */ struct sk_filter *bpfFilter; /* BPF Filter */ filtering_hash_bucket **filtering_hash; /* Filtering Rules */ u_int16_t num_filtering_rules; u_int8_t rules_default_accept_policy; /* 1=default is accept, drop otherwise */ struct list_head rules; atomic_t num_ring_users; /* Locks */ wait_queue_head_t ring_slots_waitqueue; rwlock_t ring_index_lock, ring_rules_lock; u_int insert_page_id, insert_slot_id; /* Indexes (Internal) */ do_handle_filtering_hash_bucket handle_hash_rule; /* Function pointer */ };

94 2010/09/17 © by struct pfring_hooks /* Hack to jump from a device directly to PF_RING */ struct pfring_hooks { u_int32_t magic; /* Should be set to PF_RING and be the first one */ unsigned int *transparent_mode; handle_ring_skb ring_handler; handle_ring_buffer buffer_ring_handler; handle_add_hdr_to_ring buffer_add_hdr_to_ring; register_pfring_plugin pfring_registration; unregister_pfring_plugin pfring_unregistration; handle_ring_dna_device ring_dna_device_handler; read_device_pfring_free_slots pfring_free_device_slots; };

95 2010/09/17 © by Global variable ring_hooks /* pf_ring.h */ #define PF_RING 27 /* Packet Ring */ #define SOCK_RING PF_RING /* pf_ring.c */ static struct pfring_hooks ring_hooks = { .magic = PF_RING, .transparent_mode = &transparent_mode, .ring_handler = skb_ring_handler, .buffer_ring_handler = buffer_ring_handler, .buffer_add_hdr_to_ring = add_hdr_to_ring, .pfring_registration = register_plugin, .pfring_unregistration = unregister_plugin, .ring_dna_device_handler = dna_device_handler, };

96 2010/09/17 © by ring_init() kernel/pf_ring.c static int __init ring_init(void) { int i, rc; if((rc = proto_register(&ring_proto, 0)) != 0) return(rc); INIT_LIST_HEAD(&ring_table);INIT_LIST_HEAD(&ring_cluster_list); INIT_LIST_HEAD(&ring_aware_device_list); INIT_LIST_HEAD(&ring_dna_devices_list); for (i = 0; i < MAX_NUM_DEVICES; i++) INIT_LIST_HEAD(&device_ring_list[i]); memset(&any_dev, 0, sizeof(any_dev)); strcpy(any_dev.name, "any"); memset(&none_dev, 0, sizeof(none_dev)); strcpy(none_dev.name, "none"); ring_proc_init(); sock_register(&ring_family_ops); register_netdevice_notifier(&ring_netdev_notifier); /* Sanity check */ if(transparent_mode > driver2pf_ring_non_transparent) transparent_mode = standard_linux_path;

97 2010/09/17 © by ring_init() (cont’ed) printk("[PF_RING] Ring slots %d\n", num_slots); printk("[PF_RING] Slot version %d\n", RING_FLOWSLOT_VERSION); printk("[PF_RING] Capture TX %s\n", enable_tx_capture ? "Yes [RX+TX]" : "No [RX only]"); printk("[PF_RING] Transparent Mode %d\n", transparent_mode); printk("[PF_RING] IP Defragment %s\n", enable_ip_defrag ? "Yes" : "No"); printk("[PF_RING] Initialized correctly\n"); register_device_handler(); pfring_enabled = 1; return 0; }

98 2010/09/17 © by ring_proc_init() static void ring_proc_init(void) { ring_proc_dir = proc_mkdir("pf_ring", #if(LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,24)) init_net. #endif proc_net); if(ring_proc_dir) { #if(LINUX_VERSION_CODE < KERNEL_VERSION(2,6,30)) ring_proc_dir->owner = THIS_MODULE; #endif ring_proc_dev_dir = proc_mkdir(PROC_DEV, ring_proc_dir); ring_proc = create_proc_read_entry(PROC_INFO, 0, ring_proc_dir, ring_proc_get_info, NULL);

99 2010/09/17 © by ring_proc_init() (cont’ed) ring_proc_plugins_info = create_proc_read_entry(PROC_PLUGINS_INFO, 0, ring_proc_dir, ring_proc_get_plugin_info, NULL); if(!ring_proc || !ring_proc_plugins_info) printk("[PF_RING] unable to register proc file\n"); else { #if(LINUX_VERSION_CODE < KERNEL_VERSION(2,6,30)) ring_proc->owner = THIS_MODULE; ring_proc_plugins_info->owner = THIS_MODULE; #endif printk("[PF_RING] registered /proc/net/pf_ring/\n"); } } else printk("[PF_RING] unable to create /proc/net/pf_ring\n"); }

100 2010/09/17 © by INIT_LIST_HEAD() struct list_head { struct list_head *next, *prev; }; static inline void INIT_LIST_HEAD(struct list_head *list) { list->next = list;list->prev = list; }

101 2010/09/17 © by register_netdevice_notifier(nb) /net/core/dev.c int register_netdevice_notifier(struct notifier_block *nb) { struct net_device *dev; struct net_device *last; struct net *net;int err; rtnl_lock(); err = raw_notifier_chain_register(&netdev_chain, nb); if (err)goto unlock; if (dev_boot_phase)goto unlock; for_each_net(net) { for_each_netdev(net, dev) { err = nb->notifier_call(nb, NETDEV_REGISTER, dev); err = notifier_to_errno(err); if (err)goto rollback; if (!(dev->flags & IFF_UP))continue; nb->notifier_call(nb, NETDEV_UP, dev); }

102 2010/09/17 © by register_netdevice_notifier(nb) (cont’ed) unlock: rtnl_unlock(); return err; rollback: last = dev; for_each_net(net) { for_each_netdev(net, dev) { if (dev == last) break; if (dev->flags & IFF_UP) { nb->notifier_call(nb, NETDEV_GOING_DOWN, dev); nb->notifier_call(nb, NETDEV_DOWN, dev); } nb->notifier_call(nb, NETDEV_UNREGISTER, dev); nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev); } } raw_notifier_chain_unregister(&netdev_chain, nb); goto unlock; } EXPORT_SYMBOL(register_netdevice_notifier);

103 2010/09/17 © by dev_add_pack(pt) /net/core/dev.c void dev_add_pack(struct packet_type *pt) { int hash; spin_lock_bh(&ptype_lock); if (pt->type == htons(ETH_P_ALL)) list_add_rcu(&pt->list, &ptype_all); else { hash = ntohs(pt->type) & PTYPE_HASH_MASK; list_add_rcu(&pt->list, &ptype_base[hash]); } spin_unlock_bh(&ptype_lock); } EXPORT_SYMBOL(dev_add_pack);

104 2010/09/17 © by ring_notifier() static struct notifier_block ring_netdev_notifier = {.notifier_call = ring_notifier, }; static int ring_notifier(struct notifier_block *this, unsigned long msg, void *data) { struct net_device *dev = data; struct pfring_hooks *hook; switch(msg) { case NETDEV_UP: break; case NETDEV_DOWN: break; case NETDEV_REGISTER: #ifdef RING_DEBUG printk("[PF_RING] packet_notifier(%s) [REGISTER][pfring_ptr=%p]\n", dev->name, dev->pfring_ptr); #endif

105 2010/09/17 © by ring_notifier() (cont’ed) if(dev->pfring_ptr == NULL) { dev->pfring_ptr = &ring_hooks; add_device_to_ring_list(dev); } break; case NETDEV_UNREGISTER: #ifdef RING_DEBUG printk("[PF_RING] packet_notifier(%s) [UNREGISTER][pfring_ptr=%p]\n", dev->name, dev->pfring_ptr); #endif hook = (struct pfring_hooks*)dev->pfring_ptr; if(hook->magic == PF_RING) { remove_device_from_ring_list(dev); dev->pfring_ptr = NULL; } break; case NETDEV_CHANGE: /* Interface state change */ case NETDEV_CHANGEADDR: break;

106 2010/09/17 © by ring_notifier() (cont’ed) case NETDEV_CHANGENAME: /* Rename interface ethX -> ethY */ { struct list_head *ptr, *tmp_ptr; #if defined(RING_DEBUG) printk("[PF_RING] device change name %s\n", dev->name); #endif list_for_each_safe(ptr, tmp_ptr, &ring_aware_device_list) { ring_device_element *dev_ptr = list_entry(ptr, ring_device_element, list); if(dev_ptr->dev == dev) { #if defined(RING_DEBUG) printk("[PF_RING] ==>> FOUND device change name %s\n", dev->name); #endif dev_ptr->proc_entry->name = dev->name; break; } } } break; default: printk("[PF_RING] packet_notifier(%s): unhandled message [msg=%lu][pfring_ptr=%p]\n", dev->name, msg, dev->pfring_ptr); break; } return NOTIFY_DONE; }

107 2010/09/17 © by proto_register() /net/core/sock.c int proto_register(struct proto *prot, int alloc_slab) { if (alloc_slab) { prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0, SLAB_HWCACHE_ALIGN | prot->slab_flags, NULL); if (prot->slab == NULL) { printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n", prot->name); goto out; } if (prot->rsk_prot != NULL) { prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);

108 2010/09/17 © by proto_register() (cont’ed) if (prot->rsk_prot->slab_name == NULL) goto out_free_sock_slab; prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name, prot->rsk_prot->obj_size, 0, SLAB_HWCACHE_ALIGN, NULL); if (prot->rsk_prot->slab == NULL) { printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n", prot->name); goto out_free_request_sock_slab_name; } } if (prot->twsk_prot != NULL) { prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);

109  proto_register() (cont'ed)

            if (prot->twsk_prot->twsk_slab_name == NULL)
                goto out_free_request_sock_slab;

            prot->twsk_prot->twsk_slab =
                kmem_cache_create(prot->twsk_prot->twsk_slab_name,
                                  prot->twsk_prot->twsk_obj_size, 0,
                                  SLAB_HWCACHE_ALIGN | prot->slab_flags,
                                  NULL);
            if (prot->twsk_prot->twsk_slab == NULL)
                goto out_free_timewait_sock_slab_name;
        }
    }

    write_lock(&proto_list_lock);
    list_add(&prot->node, &proto_list);
    assign_proto_idx(prot);
    write_unlock(&proto_list_lock);
    return 0;

out_free_timewait_sock_slab_name:
    kfree(prot->twsk_prot->twsk_slab_name);

110  proto_register() (cont'ed)

out_free_request_sock_slab:
    if (prot->rsk_prot && prot->rsk_prot->slab) {
        kmem_cache_destroy(prot->rsk_prot->slab);
        prot->rsk_prot->slab = NULL;
    }
out_free_request_sock_slab_name:
    if (prot->rsk_prot)
        kfree(prot->rsk_prot->slab_name);
out_free_sock_slab:
    kmem_cache_destroy(prot->slab);
    prot->slab = NULL;
out:
    return -ENOBUFS;
}
EXPORT_SYMBOL(proto_register);
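Seen from the caller's side, a protocol module invokes proto_register() once at load time. Below is a minimal sketch in the spirit of PF_RING's ring_proto (referenced by ring_create() on slide 114); the field values and the wrapper function name are assumptions for illustration.

/* Sketch: field values are assumed, not taken from the slides. */
static struct proto ring_proto = {
    .name     = "PF_RING",
    .owner    = THIS_MODULE,
    .obj_size = sizeof(struct sock), /* no private area beyond struct sock */
};

static int ring_proto_init(void)  /* hypothetical helper */
{
    /* alloc_slab = 0: skip the dedicated slab cache path shown above */
    return proto_register(&ring_proto, 0);
}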

111  sock_register()  /net/socket.c

/* sock_register - add a socket protocol handler */
int sock_register(const struct net_proto_family *ops)
{
    int err;

    if (ops->family >= NPROTO) {
        printk(KERN_CRIT "protocol %d >= NPROTO(%d)\n", ops->family, NPROTO);
        return -ENOBUFS;
    }

    spin_lock(&net_family_lock);
    if (net_families[ops->family])
        err = -EEXIST;
    else {
        net_families[ops->family] = ops;
        err = 0;
    }
    spin_unlock(&net_family_lock);

    printk(KERN_INFO "NET: Registered protocol family %d\n", ops->family);
    return err;
}
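sock_register() is what makes a user-space socket(PF_RING, SOCK_RAW, htons(ETH_P_ALL)) call land in ring_create() (slide 114). A hedged sketch of how PF_RING would advertise its family; the variable name ring_family_ops is an assumption.

/* Sketch (naming assumed): map the PF_RING family to ring_create(). */
static struct net_proto_family ring_family_ops = {
    .family = PF_RING,
    .create = ring_create,
    .owner  = THIS_MODULE,
};

/* at module init: sock_register(&ring_family_ops); */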

112  register_device_handler(void)

/* Protocol hook */
static struct packet_type prot_hook;

void register_device_handler(void)
{
    if(transparent_mode != standard_linux_path)
        return;

    prot_hook.func = packet_rcv;
    prot_hook.type = htons(ETH_P_ALL);
    dev_add_pack(&prot_hook);
}
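The symmetric teardown would use dev_remove_pack(), the kernel counterpart of dev_add_pack() from slide 103. A minimal sketch (the function name is an assumption):

/* Sketch: function name assumed; dev_remove_pack() is the real kernel API. */
void unregister_device_handler(void)
{
    if(transparent_mode != standard_linux_path)
        return;
    dev_remove_pack(&prot_hook);  /* stop receiving ETH_P_ALL packets */
}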

113  sk_alloc()  /net/core/sock.c

/**
 * sk_alloc - All socket objects are allocated here
 * @net: the applicable net namespace
 * @family: protocol family
 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 * @prot: struct proto associated with this new sock instance
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
                      struct proto *prot)
{
    struct sock *sk;

    sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
    if (sk) {
        sk->sk_family = family;
        sk->sk_prot = sk->sk_prot_creator = prot;
        sock_lock_init(sk);
        sock_net_set(sk, get_net(net));
        atomic_set(&sk->sk_wmem_alloc, 1);
    }
    return sk;
}
EXPORT_SYMBOL(sk_alloc);

114  ring_create()

static int ring_create(struct net *net, struct socket *sock,
                       int protocol, int kern)
{
    struct sock *sk;
    struct ring_opt *pfr;
    int err;

#if defined(RING_DEBUG)
    printk("[PF_RING] ring_create()\n");
#endif

    if(!capable(CAP_NET_ADMIN))
        return -EPERM;
    if(sock->type != SOCK_RAW)
        return -ESOCKTNOSUPPORT;
    if(protocol != htons(ETH_P_ALL))
        return -EPROTONOSUPPORT;

    err = -ENOMEM;
#if(LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,11))
    sk = sk_alloc(PF_RING, GFP_KERNEL, 1, NULL);
#else
#if(LINUX_VERSION_CODE < KERNEL_VERSION(2,6,24))
    sk = sk_alloc(PF_RING, GFP_ATOMIC, &ring_proto, 1);
#else
    sk = sk_alloc(net, PF_INET, GFP_KERNEL, &ring_proto);
#endif
#endif

115  ring_create() (cont'ed)

    if(sk == NULL)
        goto out;

    sock->ops = &ring_ops;
    sock_init_data(sock, sk);
#if(LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,11))
    sk_set_owner(sk, THIS_MODULE);
#endif

    err = -ENOMEM;
    ring_sk(sk) = ring_sk_datatype(kmalloc(sizeof(*pfr), GFP_KERNEL));
    if(!(pfr = ring_sk(sk))) {
        sk_free(sk);
        goto out;
    }

    memset(pfr, 0, sizeof(*pfr));
    pfr->ring_active = 0;   /* Activate as soon as somebody waits for packets */
    pfr->num_rx_channels = UNKNOWN_NUM_RX_CHANNELS;
    pfr->channel_id = RING_ANY_CHANNEL;
    pfr->bucket_len = DEFAULT_BUCKET_LEN;
    pfr->handle_hash_rule = handle_filtering_hash_bucket;

116  ring_create() (cont'ed)

    init_waitqueue_head(&pfr->ring_slots_waitqueue);
    rwlock_init(&pfr->ring_index_lock);
    rwlock_init(&pfr->ring_rules_lock);
    atomic_set(&pfr->num_ring_users, 0);
    INIT_LIST_HEAD(&pfr->rules);
    sk->sk_family = PF_RING;
    sk->sk_destruct = ring_sock_destruct;
    ring_insert(sk);

    pfr->master_ring = NULL;
    pfr->ring_netdev = &none_dev; /* Unbound socket */
    pfr->sample_rate = 1;         /* No sampling */
    pfr->ring_pid = current->pid;
    pfr->ring_id = ring_id_serial++;
    ring_proc_add(pfr);

#if defined(RING_DEBUG)
    printk("[PF_RING] ring_create(): created\n");
#endif
    return(0);

out:
    return err;
}

117  packet_rcv(skb,dev,pt,orig_dev)

static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
                      struct packet_type *pt, struct net_device *orig_dev)
{
    int rc;

    if(skb->pkt_type != PACKET_LOOPBACK) {
        rc = skb_ring_handler(skb,
                              (skb->pkt_type == PACKET_OUTGOING) ? 0 : 1,
                              1, UNKNOWN_RX_CHANNEL, UNKNOWN_NUM_RX_CHANNELS);
    } else
        rc = 0;

    kfree_skb(skb);
    return(rc);
}

118  skb_ring_handler()

static int skb_ring_handler(struct sk_buff *skb,
                            u_char recv_packet,
                            u_char real_skb /* 1=real skb, 0=faked skb */,
                            u_int8_t channel_id,
                            u_int8_t num_rx_channels)
{
    struct sock *skElement;
    int rc = 0, is_ip_pkt, displ;
    struct list_head *ptr;
    struct pfring_pkthdr hdr;
    struct sk_buff *skk = NULL, *orig_skb = skb;

119  skb_ring_handler() (cont'ed)

    if((!skb) || ((!enable_tx_capture) && (!recv_packet))) {
        /* An outgoing packet is about to be sent out
           but we decided not to handle transmitted packets. */
        return(0);
    }

    if(recv_packet) {
        /* Hack for identifying a packet received by the e1000 */
        if(real_skb)
            displ = SKB_DISPLACEMENT;
        else
            displ = 0;  /* Received by the e1000 wrapper */
    } else
        displ = 0;

    is_ip_pkt = parse_pkt(skb, displ, &hdr);

120  skb_ring_handler() (cont'ed)

    if(enable_ip_defrag && real_skb && is_ip_pkt && recv_packet
       && (ring_table_size > 0)) {
        struct sk_buff *cloned = NULL;
        struct iphdr *iphdr = NULL;

        skb_reset_network_header(skb);
        skb_reset_transport_header(skb);
        skb_set_network_header(skb, ETH_HLEN - displ);

        iphdr = ip_hdr(skb);
        if(iphdr) {
            if(iphdr->frag_off & htons(IP_MF | IP_OFFSET)) {
                if((cloned = skb_clone(skb, GFP_ATOMIC)) != NULL) {
                    skk = ring_gather_frags(cloned);
                    if(skk != NULL) {
                        skb = skk;
                        parse_pkt(skb, displ, &hdr);
                        hdr.len = hdr.caplen = skb->len + displ;
                    } else {
                        return(0);  /* mask rcvd fragments */
                    }

121  skb_ring_handler() (cont'ed)

    if(skb->tstamp.tv64 == 0)
        __net_timestamp(skb);
    hdr.ts = ktime_to_timeval(skb->tstamp);
    hdr.len = hdr.caplen = skb->len + displ;

    /* Avoid the ring to be manipulated while playing with it */
    read_lock_bh(&ring_mgmt_lock);

    /* [1] Check unclustered sockets */
    list_for_each(ptr, &ring_table) {
        struct ring_opt *pfr;
        struct ring_element *entry;

        entry = list_entry(ptr, struct ring_element, list);
        skElement = entry->sk;
        pfr = ring_sk(skElement);

122  skb_ring_handler() (cont'ed)

        if((pfr != NULL)
           && (pfr->ring_netdev != &none_dev)
           && (pfr->cluster_id == 0)
           && (pfr->ring_slots != NULL)
           && is_valid_skb_direction(pfr->direction, recv_packet)
           && ((pfr->ring_netdev == skb->dev)
               || (pfr->ring_netdev == &any_dev) /* Socket bound to 'any' */
               || ((skb->dev->flags & IFF_SLAVE)
                   && (pfr->ring_netdev == skb->dev->master)))) {
            /* We've found the ring where the packet can be stored */
            int old_caplen = hdr.caplen;  /* Keep old length */
            hdr.caplen = min(hdr.caplen, pfr->bucket_len);
            add_skb_to_ring(skb, pfr, &hdr, is_ip_pkt, displ,
                            channel_id, num_rx_channels);
            hdr.caplen = old_caplen;
            rc = 1;  /* Ring found: we've done our job */
        }

123  skb_ring_handler() (cont'ed)

    /* [2] Check socket clusters */
    list_for_each(ptr, &ring_cluster_list) {
        ring_cluster_element *cluster_ptr;
        struct ring_opt *pfr;

        cluster_ptr = list_entry(ptr, ring_cluster_element, list);
        if(cluster_ptr->cluster.num_cluster_elements > 0) {
            u_int skb_hash = hash_pkt_cluster(cluster_ptr, &hdr);
            u_short num_iterations;

            for(num_iterations = 0;
                num_iterations < cluster_ptr->cluster.num_cluster_elements;
                num_iterations++) {
                skElement = cluster_ptr->cluster.sk[skb_hash];
                if(skElement != NULL) {
                    pfr = ring_sk(skElement);

124  skb_ring_handler() (cont'ed)

                    if((pfr != NULL)
                       && (pfr->ring_slots != NULL)
                       && ((pfr->ring_netdev == skb->dev)
                           || ((skb->dev->flags & IFF_SLAVE)
                               && (pfr->ring_netdev == skb->dev->master)))
                       && is_valid_skb_direction(pfr->direction, recv_packet)) {
                        FlowSlot *theSlot = get_insert_slot(pfr);

                        if((theSlot == NULL)
                           || (theSlot->slot_state == 0 /* Not full */)) {
                            /* We've found the ring where the packet can be stored */
                            add_skb_to_ring(skb, pfr, &hdr, is_ip_pkt,
                                            displ, channel_id, num_rx_channels);
                            rc = 1;  /* Ring found: we've done our job */
                            break;
                        }
                    }
                }

125  skb_ring_handler() (cont'ed)

                if(cluster_ptr->cluster.hashing_mode != cluster_round_robin)
                    break;
                else
                    skb_hash = (skb_hash + 1) %
                               cluster_ptr->cluster.num_cluster_elements;
            }
        }
    } /* Clustering */

    read_unlock_bh(&ring_mgmt_lock);

    /* Fragment handling */
    if(skk != NULL)
        kfree_skb(skk);

    if(rc == 1) {
        if(transparent_mode != driver2pf_ring_non_transparent) {
            rc = 0;
        } else {
            if(recv_packet && real_skb)
                kfree_skb(orig_skb);
        }
    }
    return(rc);  /* 0 = packet not handled */
}

126  User space library

Every function in the user-space API carries the 'pfring_' prefix. A user-space
struct pfring instance holds all the state needed to talk to the underlying
PF_RING kernel module.

int pfring_recv(pfring *ring, char* buffer, u_int buffer_len,
                struct pfring_pkthdr *hdr, u_int8_t wait_for_incoming_packet);
pfring* pfring_open(char *device_name, u_int8_t promisc,
                    u_int32_t caplen, u_int8_t reentrant);
int pfring_bind(pfring *ring, char *device_name);
int pfring_read(pfring *ring, char* buffer, u_int buffer_len,
                struct pfring_pkthdr *hdr, u_int8_t wait_for_incoming_packet,
                u_int8_t consume_packet_immediately);
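Put together, a minimal capture loop looks roughly like the sketch below. The header name, device name and capture length are illustrative assumptions; error handling is trimmed.

/* Sketch using only the API shown above; "pfring.h", "eth0" and the
 * 1500-byte caplen are assumptions, not values mandated by PF_RING. */
#include <stdio.h>
#include "pfring.h"

int main(void) {
    char buffer[2048];
    struct pfring_pkthdr hdr;
    pfring *ring = pfring_open("eth0", 1 /* promisc */,
                               1500 /* caplen */, 0 /* not reentrant */);
    if(ring == NULL) return 1;

    while(1) {
        /* wait_for_incoming_packet=1: block until a packet is available */
        if(pfring_recv(ring, buffer, sizeof(buffer), &hdr, 1) > 0)
            printf("Got %u bytes\n", hdr.len);
    }
    return 0;
}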

127  struct pfring

typedef struct {
    /* DNA (Direct NIC Access) */
    u_char dna_mapped_device;
    u_int32_t tot_dna_read_pkts, rx_reg;
    dna_device dna_dev;
    u_int32_t *rx_reg_ptr[MAX_NUM_RX_CHANNELS];

    /* All devices */
    char *buffer, *slots, *device_name;
    int fd;
    FlowSlotInfo *slots_info;
    FlowSlot *last_slot_to_update;
    u_int page_id, slot_id, pkts_per_page;
    u_int poll_sleep;
    u_int8_t clear_promisc, reentrant;
    u_long num_poll_calls;
    pthread_spinlock_t spinlock;
} pfring;

128  pfring_open()

pfring* pfring_open(char *device_name, u_int8_t promisc,
                    u_int32_t caplen, u_int8_t _reentrant)
{
    int err = 0;
    pfring *ring = (pfring*)malloc(sizeof(pfring));

    if(ring == NULL)
        return(NULL);
    else
        memset(ring, 0, sizeof(pfring));

    ring->reentrant = _reentrant;
    ring->fd = socket(PF_RING, SOCK_RAW, htons(ETH_P_ALL));

    if(ring->fd > 0) {
        int rc;
        u_int memSlotsLen;

        if(caplen > MAX_CAPLEN)
            caplen = MAX_CAPLEN;
        setsockopt(ring->fd, 0, SO_RING_BUCKET_LEN, &caplen, sizeof(caplen));

        if((device_name == NULL) || (strcmp(device_name, "none") == 0)) {
            rc = 0;  /* No binding yet */
        } else
            rc = pfring_bind(ring, device_name);

129  pfring_open() (cont'ed)

        if(rc == 0) {
            ring->buffer = (char *)mmap(NULL, PAGE_SIZE,
                                        PROT_READ|PROT_WRITE,
                                        MAP_SHARED, ring->fd, 0);
            if(ring->buffer == MAP_FAILED) {
                printf("mmap() failed");
                free(ring);
                return(NULL);
            }

            ring->slots_info = (FlowSlotInfo *)ring->buffer;
            if(ring->slots_info->version != RING_FLOWSLOT_VERSION) {
                printf("Wrong RING version: kernel is %i, libpfring was compiled with %i\n",
                       ring->slots_info->version, RING_FLOWSLOT_VERSION);
                free(ring);
                return(NULL);
            }

            memSlotsLen = ring->slots_info->tot_mem;
            munmap(ring->buffer, PAGE_SIZE);

            ring->buffer = (char *)mmap(NULL, memSlotsLen,
                                        PROT_READ|PROT_WRITE,
                                        MAP_SHARED, ring->fd, 0);
            if(ring->buffer == MAP_FAILED) {
                printf("mmap() failed");
                free(ring);
                return(NULL);
            }

130  pfring_open() (cont'ed)

            ring->slots_info = (FlowSlotInfo *)ring->buffer;
            ring->slots = (char *)(ring->buffer + sizeof(FlowSlotInfo));

            if(ring->slots_info->remove_idx >= ring->slots_info->tot_slots)
                ring->slots_info->remove_idx = 0;

            ring->page_id = PAGE_SIZE, ring->slot_id = 0, ring->pkts_per_page = 0;
            ring->device_name = strdup(device_name);

            if(promisc) {
                if(set_if_promisc(device_name, 1) == 0)
                    ring->clear_promisc = 1;
            }
        } else {
            close(ring->fd);
            err = -1;
        }
    } else {
        err = -1;
        free(ring);
    }

    if(err == 0) {
        if(ring->reentrant)
            pthread_spin_init(&ring->spinlock, PTHREAD_PROCESS_PRIVATE);
        return(ring);
    } else
        return(NULL);
}

131  pfring_bind()

int pfring_bind(pfring *ring, char *device_name)
{
    struct sockaddr sa;
    char *at;
    int32_t channel_id = -1;
    int rc = 0;

    if((device_name == NULL) || (strcmp(device_name, "none") == 0))
        return(-1);

    at = strchr(device_name, '@');
    if(at != NULL) {
        char *tok, *pos = NULL;

        at[0] = '\0';

        /* Syntax: channel 1 and 5, channel 1,2...5, channel 1,2,3,5,6,7 */
        tok = strtok_r(&at[1], ",", &pos);
        channel_id = 0;

132  pfring_bind() (cont'ed)

        while(tok != NULL) {
            char *dash = strchr(tok, '-');
            int32_t min_val, max_val, i;

            if(dash) {
                dash[0] = '\0';
                min_val = atoi(tok);
                max_val = atoi(&dash[1]);
            } else
                min_val = max_val = atoi(tok);

            for(i = min_val; i <= max_val; i++)
                channel_id |= 1 << i;

            tok = strtok_r(NULL, ",", &pos);
        }
    }

    sa.sa_family = PF_RING;
    snprintf(sa.sa_data, sizeof(sa.sa_data), "%s", device_name);
    rc = bind(ring->fd, (struct sockaddr *)&sa, sizeof(sa));

    if(rc == 0) {
        if(channel_id != -1) {
            int rc = pfring_set_channel_id(ring, channel_id);
            if(rc != 0)
                printf("pfring_set_channel_id() failed: %d\n", rc);
        }
    }
    return(rc);
}
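As a usage example (the device name is an illustrative assumption): binding to "eth0@1,3-5" strips the suffix at '@', tokenizes "1" and "3-5", and ORs the corresponding bits into the channel_id mask.

/* Illustrative only: "eth0" is a placeholder device name. */
pfring_bind(ring, "eth0@1,3-5");
/* -> device_name becomes "eth0", tokens "1" and "3-5"
 * -> channel_id = (1<<1) | (1<<3) | (1<<4) | (1<<5) = 0x3A */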

133  pfring_recv()

pfring_recv() is just a thin wrapper around pfring_read(), with
consume_packet_immediately hardwired to 1:

int pfring_recv(pfring *ring, char* buffer, u_int buffer_len,
                struct pfring_pkthdr *hdr, u_int8_t wait_for_incoming_packet)
{
    return(pfring_read(ring, buffer, buffer_len, hdr,
                       wait_for_incoming_packet, 1));
}

134  pfring_read()

int pfring_read(pfring *ring, char* buffer, u_int buffer_len,
                struct pfring_pkthdr *hdr,
                u_int8_t wait_for_incoming_packet,
                u_int8_t consume_packet_immediately)
{
    if(ring == NULL)
        return(-1);

    if(ring->reentrant) {
        /* Late packet consumption is not supported in a multithreaded
           environment, as threads can steal each other's packets */
        consume_packet_immediately = 1;
    }

    if(ring->dna_mapped_device) {
        char *pkt = NULL;

        if(wait_for_incoming_packet) {
            if(ring->reentrant)
                pthread_spin_lock(&ring->spinlock);

            switch(ring->dna_dev.device_model) {
            case intel_e1000:
                e1000_there_is_a_packet_to_read(ring, wait_for_incoming_packet);
                break;
            default:
                return(0);
            }

            if(ring->reentrant)
                pthread_spin_unlock(&ring->spinlock);
        }

135  pfring_read() (cont'ed)

        switch(ring->dna_dev.device_model) {
        case intel_e1000:
            pkt = get_next_e1000_packet(ring, buffer, buffer_len, hdr);
            break;
        case intel_igb:
            pkt = NULL, hdr->len = 0;
            break;
        case intel_ixgbe:
            pkt = NULL, hdr->len = 0;
            break;
        }

        if(pkt && (hdr->len > 0)) {
            /* Set the (1) below to (0) for enabling packet parsing
               for DNA devices */
            if(1)
                hdr->parsed_header_len = 0;
            else
                parse_pkt(buffer, hdr);
            return(1);
        } else
            return(0);
    } else {
        FlowSlot *slot;
        u_int32_t queuedPkts;
#ifdef USE_ADAPTIVE_WAIT
        u_int32_t num_loops = 0;
#endif

        if((ring == NULL) || (ring->buffer == NULL))
            return(-1);

        if(ring->last_slot_to_update)
            pfring_notify(ring, REFLECT_PACKET_DEVICE_NONE);

136  pfring_read() (cont'ed)

do_pfring_recv:
        if(ring->reentrant)
            pthread_spin_lock(&ring->spinlock);

        slot = (FlowSlot*)&ring->slots[ring->slots_info->remove_idx *
                                       ring->slots_info->slot_len];

        if(ring->slots_info->tot_insert >= ring->slots_info->tot_read)
            queuedPkts = ring->slots_info->tot_insert - ring->slots_info->tot_read;
        else
            queuedPkts = ring->slots_info->tot_slots +
                         ring->slots_info->tot_insert - ring->slots_info->tot_read;

        if(queuedPkts && (slot->slot_state == 1 /* There's a packet to read */)) {
            char *bucket = (char*)&slot->bucket;
            struct pfring_pkthdr *_hdr = (struct pfring_pkthdr*)bucket;
            int bktLen = _hdr->caplen + _hdr->parsed_header_len;

            if(bktLen > buffer_len)
                bktLen = buffer_len - 1;

            if(buffer && (bktLen > 0)) {
                memcpy(buffer, &bucket[sizeof(struct pfring_pkthdr)], bktLen);
                bucket[bktLen] = '\0';
            }

137  pfring_read() (cont'ed)

            if(ring->slots_info->remove_idx >= (ring->slots_info->tot_slots - 1)) {
                ring->slots_info->remove_idx = 0;
                ring->page_id = PAGE_SIZE, ring->slot_id = 0, ring->pkts_per_page = 0;
            } else {
                ring->slots_info->remove_idx++;
                ring->pkts_per_page++, ring->slot_id += ring->slots_info->slot_len;
            }

            if(hdr)
                memcpy(hdr, _hdr, sizeof(struct pfring_pkthdr));

            ring->slots_info->tot_read++;

            if(consume_packet_immediately) {
                ring->last_slot_to_update = NULL, slot->slot_state = 0; /* Empty slot */
            } else {
                /* We do not notify pf_ring that the packet has been read,
                   hence this slot will not be available for storing a new
                   packet until we notify pf_ring */
                ring->last_slot_to_update = slot;
            }

138  pfring_read() (cont'ed)

            if(ring->reentrant)
                pthread_spin_unlock(&ring->spinlock);
            return(1);
        } else {
            if(ring->reentrant)
                pthread_spin_unlock(&ring->spinlock);

            if(wait_for_incoming_packet) {
                struct pollfd pfd;
                int rc;

#ifdef USE_ADAPTIVE_WAIT
                /* Spin in userland for a while and if no packet arrives,
                   then it's time to poll the kernel. Only do poll() if there
                   is no chance to avoid it, as a call to poll() is too costly */
                if(num_loops < MAX_NUM_LOOPS) {
                    num_loops++;
                    if(num_loops % YIELD_MULTIPLIER)
                        sched_yield();
                }
#endif

139  pfring_read() (cont'ed)

                /* Sleep when nothing is happening */
                pfd.fd = ring->fd;
                pfd.events = POLLIN|POLLERR;
                pfd.revents = 0;

                errno = 0;
                rc = poll(&pfd, 1, -1);
                ring->num_poll_calls++;

                if(rc == -1)
                    return(-1);
                else
                    goto do_pfring_recv;
            }
            return(-1);  /* Not reached */
        }
    }
}

140  pfcount.c

main() {
    /* Omitted: argument processing code */

    if(device == NULL) device = DEFAULT_DEVICE;
    if(num_threads > MAX_NUM_THREADS) num_threads = MAX_NUM_THREADS;

    printf("Capturing from %s\n", device);

    /* hardcode: promisc=1, to_ms=500 */
    promisc = 1;

    if(num_threads > 0)
        pthread_rwlock_init(&statsLock, NULL);

    if(!dna_mode)
        pd = pfring_open(device, promisc, snaplen, (num_threads > 0) ? 1 : 0);
#ifdef ENABLE_DNA_SUPPORT
    else
        pd = pfring_open_dna(device, 0 /* we don't use threads */);
#endif

    /* Omitted: check pd to see if pfring_open() failed */
    /* Omitted: set filtering rule */

141  pfcount.c (cont'ed)

    signal(SIGINT, sigproc);
    signal(SIGTERM, sigproc);
    signal(SIGINT, sigproc);

    if(!verbose) {
        signal(SIGALRM, my_sigalarm);
        alarm(ALARM_SLEEP);
    }

    if(dna_mode)
        num_threads = 1;
    else {
        if(num_threads > 0) wait_for_packet = 1;
    }

    if(!wait_for_packet) pfring_enable_ring(pd);

    if(num_threads > 1) {
        pthread_t my_thread;
        int i;

        /* The slide is truncated here; the loop body below is reconstructed
           from context (it spawns the packet_consumer_thread shown next). */
        for(i=1; i<num_threads; i++)
            pthread_create(&my_thread, NULL, packet_consumer_thread, (void*)(long)i);
    }

142  packet_consumer_thread()

void* packet_consumer_thread(void* _id) {
    while(1) {
        struct simple_stats {
            u_int64_t num_pkts, num_bytes;
        };
        u_char buffer[2048];
        struct simple_stats stats;
        struct pfring_pkthdr hdr;
        int rc;
        u_int len;

        if(do_shutdown) break;

        if(pfring_recv(pd, (char*)buffer, sizeof(buffer), &hdr, wait_for_packet) > 0) {
            if(do_shutdown) break;
            dummyProcesssPacket(&hdr, buffer);
        }

        if(0) {
            len = sizeof(stats);
            rc = pfring_get_filtering_rule_stats(pd, 5, (char*)&stats, &len);
            if(rc < 0)
                printf("pfring_get_filtering_rule_stats() failed [rc=%d]\n", rc);
            else {
                printf("[Pkts=%u][Bytes=%u]\n",
                       (unsigned int)stats.num_pkts,
                       (unsigned int)stats.num_bytes);
            }
        }
    }

    return(NULL);
}

143  Questions?

