Linux/net/ipv4/route.c

  1 /*
  2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
  3  *              operating system.  INET is implemented using the  BSD Socket
  4  *              interface as the means of communication with the user level.
  5  *
  6  *              ROUTE - implementation of the IP router.
  7  *
  8  * Authors:     Ross Biro
  9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 13  *
 14  * Fixes:
 15  *              Alan Cox        :       Verify area fixes.
 16  *              Alan Cox        :       cli() protects routing changes
 17  *              Rui Oliveira    :       ICMP routing table updates
 18  *              (rco@di.uminho.pt)      Routing table insertion and update
 19  *              Linus Torvalds  :       Rewrote bits to be sensible
 20  *              Alan Cox        :       Added BSD route gw semantics
 21  *              Alan Cox        :       Super /proc >4K
 22  *              Alan Cox        :       MTU in route table
 23  *              Alan Cox        :       MSS actually. Also added the window
 24  *                                      clamper.
 25  *              Sam Lantinga    :       Fixed route matching in rt_del()
 26  *              Alan Cox        :       Routing cache support.
 27  *              Alan Cox        :       Removed compatibility cruft.
 28  *              Alan Cox        :       RTF_REJECT support.
 29  *              Alan Cox        :       TCP irtt support.
 30  *              Jonathan Naylor :       Added Metric support.
 31  *      Miquel van Smoorenburg  :       BSD API fixes.
 32  *      Miquel van Smoorenburg  :       Metrics.
 33  *              Alan Cox        :       Use __u32 properly
 34  *              Alan Cox        :       Aligned routing errors more closely with BSD,
 35  *                                      though our system is still very different.
 36  *              Alan Cox        :       Faster /proc handling
 37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 38  *                                      routing caches and better behaviour.
 39  *
 40  *              Olaf Erb        :       irtt wasn't being copied right.
 41  *              Bjorn Ekwall    :       Kerneld route support.
 42  *              Alan Cox        :       Multicast fixed (I hope)
 43  *              Pavel Krauz     :       Limited broadcast fixed
 44  *              Mike McLagan    :       Routing by source
 45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 46  *                                      route.c and rewritten from scratch.
 47  *              Andi Kleen      :       Load-limit warning messages.
 48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 52  *              Marc Boucher    :       routing by fwmark
 53  *      Robert Olsson           :       Added rt_cache statistics
 54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
 56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
 57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
 58  *
 59  *              This program is free software; you can redistribute it and/or
 60  *              modify it under the terms of the GNU General Public License
 61  *              as published by the Free Software Foundation; either version
 62  *              2 of the License, or (at your option) any later version.
 63  */
 64 
 65 #define pr_fmt(fmt) "IPv4: " fmt
 66 
 67 #include <linux/module.h>
 68 #include <asm/uaccess.h>
 69 #include <linux/bitops.h>
 70 #include <linux/types.h>
 71 #include <linux/kernel.h>
 72 #include <linux/mm.h>
 73 #include <linux/string.h>
 74 #include <linux/socket.h>
 75 #include <linux/sockios.h>
 76 #include <linux/errno.h>
 77 #include <linux/in.h>
 78 #include <linux/inet.h>
 79 #include <linux/netdevice.h>
 80 #include <linux/proc_fs.h>
 81 #include <linux/init.h>
 82 #include <linux/skbuff.h>
 83 #include <linux/inetdevice.h>
 84 #include <linux/igmp.h>
 85 #include <linux/pkt_sched.h>
 86 #include <linux/mroute.h>
 87 #include <linux/netfilter_ipv4.h>
 88 #include <linux/random.h>
 89 #include <linux/rcupdate.h>
 90 #include <linux/times.h>
 91 #include <linux/slab.h>
 92 #include <net/dst.h>
 93 #include <net/net_namespace.h>
 94 #include <net/protocol.h>
 95 #include <net/ip.h>
 96 #include <net/route.h>
 97 #include <net/inetpeer.h>
 98 #include <net/sock.h>
 99 #include <net/ip_fib.h>
100 #include <net/arp.h>
101 #include <net/tcp.h>
102 #include <net/icmp.h>
103 #include <net/xfrm.h>
104 #include <net/netevent.h>
105 #include <net/rtnetlink.h>
106 #ifdef CONFIG_SYSCTL
107 #include <linux/sysctl.h>
108 #include <linux/kmemleak.h>
109 #endif
110 #include <net/secure_seq.h>
111 
112 #define RT_FL_TOS(oldflp4) \
113         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
114 
115 #define RT_GC_TIMEOUT (300*HZ)
116 
117 static int ip_rt_max_size;
118 static int ip_rt_redirect_number __read_mostly  = 9;
119 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
120 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
121 static int ip_rt_error_cost __read_mostly       = HZ;
122 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
123 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
124 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
125 static int ip_rt_min_advmss __read_mostly       = 256;
126 
127 /*
128  *      Interface to generic destination cache.
129  */
130 
131 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
132 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
133 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
134 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
135 static void              ipv4_link_failure(struct sk_buff *skb);
136 static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
137                                            struct sk_buff *skb, u32 mtu);
138 static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
139                                         struct sk_buff *skb);
140 static void             ipv4_dst_destroy(struct dst_entry *dst);
141 
142 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
143 {
144         WARN_ON(1);
145         return NULL;
146 }
147 
148 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
149                                            struct sk_buff *skb,
150                                            const void *daddr);
151 
152 static struct dst_ops ipv4_dst_ops = {
153         .family =               AF_INET,
154         .protocol =             cpu_to_be16(ETH_P_IP),
155         .check =                ipv4_dst_check,
156         .default_advmss =       ipv4_default_advmss,
157         .mtu =                  ipv4_mtu,
158         .cow_metrics =          ipv4_cow_metrics,
159         .destroy =              ipv4_dst_destroy,
160         .negative_advice =      ipv4_negative_advice,
161         .link_failure =         ipv4_link_failure,
162         .update_pmtu =          ip_rt_update_pmtu,
163         .redirect =             ip_do_redirect,
164         .local_out =            __ip_local_out,
165         .neigh_lookup =         ipv4_neigh_lookup,
166 };
167 
168 #define ECN_OR_COST(class)      TC_PRIO_##class
169 
170 const __u8 ip_tos2prio[16] = {
171         TC_PRIO_BESTEFFORT,
172         ECN_OR_COST(BESTEFFORT),
173         TC_PRIO_BESTEFFORT,
174         ECN_OR_COST(BESTEFFORT),
175         TC_PRIO_BULK,
176         ECN_OR_COST(BULK),
177         TC_PRIO_BULK,
178         ECN_OR_COST(BULK),
179         TC_PRIO_INTERACTIVE,
180         ECN_OR_COST(INTERACTIVE),
181         TC_PRIO_INTERACTIVE,
182         ECN_OR_COST(INTERACTIVE),
183         TC_PRIO_INTERACTIVE_BULK,
184         ECN_OR_COST(INTERACTIVE_BULK),
185         TC_PRIO_INTERACTIVE_BULK,
186         ECN_OR_COST(INTERACTIVE_BULK)
187 };
188 EXPORT_SYMBOL(ip_tos2prio);
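
The table maps the four TOS bits (paired even/odd entries for the plain class and its ECN_OR_COST variant) to traffic-control priority bands. Callers index it with the TOS byte shifted right by one; the helper in include/net/route.h of this era looks roughly like the sketch below (a paraphrase, not a verbatim copy):

        static inline char rt_tos2priority(u8 tos)
        {
                /* IPTOS_TOS() keeps the four TOS bits (0x1e); shifting
                 * right by one aligns them to index the 16-entry table.
                 */
                return ip_tos2prio[IPTOS_TOS(tos) >> 1];
        }
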
189 
190 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
191 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
192 
193 #ifdef CONFIG_PROC_FS
194 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
195 {
196         if (*pos)
197                 return NULL;
198         return SEQ_START_TOKEN;
199 }
200 
201 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
202 {
203         ++*pos;
204         return NULL;
205 }
206 
207 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
208 {
209 }
210 
211 static int rt_cache_seq_show(struct seq_file *seq, void *v)
212 {
213         if (v == SEQ_START_TOKEN)
214                 seq_printf(seq, "%-127s\n",
215                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
216                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
217                            "HHUptod\tSpecDst");
218         return 0;
219 }
220 
221 static const struct seq_operations rt_cache_seq_ops = {
222         .start  = rt_cache_seq_start,
223         .next   = rt_cache_seq_next,
224         .stop   = rt_cache_seq_stop,
225         .show   = rt_cache_seq_show,
226 };
227 
228 static int rt_cache_seq_open(struct inode *inode, struct file *file)
229 {
230         return seq_open(file, &rt_cache_seq_ops);
231 }
232 
233 static const struct file_operations rt_cache_seq_fops = {
234         .owner   = THIS_MODULE,
235         .open    = rt_cache_seq_open,
236         .read    = seq_read,
237         .llseek  = seq_lseek,
238         .release = seq_release,
239 };
240 
241 
242 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
243 {
244         int cpu;
245 
246         if (*pos == 0)
247                 return SEQ_START_TOKEN;
248 
249         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
250                 if (!cpu_possible(cpu))
251                         continue;
252                 *pos = cpu+1;
253                 return &per_cpu(rt_cache_stat, cpu);
254         }
255         return NULL;
256 }
257 
258 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
259 {
260         int cpu;
261 
262         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
263                 if (!cpu_possible(cpu))
264                         continue;
265                 *pos = cpu+1;
266                 return &per_cpu(rt_cache_stat, cpu);
267         }
268         return NULL;
269 
270 }
271 
272 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
273 {
274 
275 }
276 
277 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
278 {
279         struct rt_cache_stat *st = v;
280 
281         if (v == SEQ_START_TOKEN) {
282                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
283                 return 0;
284         }
285 
286         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
287                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
288                    dst_entries_get_slow(&ipv4_dst_ops),
289                    0, /* st->in_hit */
290                    st->in_slow_tot,
291                    st->in_slow_mc,
292                    st->in_no_route,
293                    st->in_brd,
294                    st->in_martian_dst,
295                    st->in_martian_src,
296 
297                    0, /* st->out_hit */
298                    st->out_slow_tot,
299                    st->out_slow_mc,
300 
301                    0, /* st->gc_total */
302                    0, /* st->gc_ignored */
303                    0, /* st->gc_goal_miss */
304                    0, /* st->gc_dst_overflow */
305                    0, /* st->in_hlist_search */
306                    0  /* st->out_hlist_search */
307                 );
308         return 0;
309 }
310 
311 static const struct seq_operations rt_cpu_seq_ops = {
312         .start  = rt_cpu_seq_start,
313         .next   = rt_cpu_seq_next,
314         .stop   = rt_cpu_seq_stop,
315         .show   = rt_cpu_seq_show,
316 };
317 
318 
319 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
320 {
321         return seq_open(file, &rt_cpu_seq_ops);
322 }
323 
324 static const struct file_operations rt_cpu_seq_fops = {
325         .owner   = THIS_MODULE,
326         .open    = rt_cpu_seq_open,
327         .read    = seq_read,
328         .llseek  = seq_lseek,
329         .release = seq_release,
330 };
331 
332 #ifdef CONFIG_IP_ROUTE_CLASSID
333 static int rt_acct_proc_show(struct seq_file *m, void *v)
334 {
335         struct ip_rt_acct *dst, *src;
336         unsigned int i, j;
337 
338         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
339         if (!dst)
340                 return -ENOMEM;
341 
342         for_each_possible_cpu(i) {
343                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
344                 for (j = 0; j < 256; j++) {
345                         dst[j].o_bytes   += src[j].o_bytes;
346                         dst[j].o_packets += src[j].o_packets;
347                         dst[j].i_bytes   += src[j].i_bytes;
348                         dst[j].i_packets += src[j].i_packets;
349                 }
350         }
351 
352         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
353         kfree(dst);
354         return 0;
355 }
356 
357 static int rt_acct_proc_open(struct inode *inode, struct file *file)
358 {
359         return single_open(file, rt_acct_proc_show, NULL);
360 }
361 
362 static const struct file_operations rt_acct_proc_fops = {
363         .owner          = THIS_MODULE,
364         .open           = rt_acct_proc_open,
365         .read           = seq_read,
366         .llseek         = seq_lseek,
367         .release        = single_release,
368 };
369 #endif
370 
371 static int __net_init ip_rt_do_proc_init(struct net *net)
372 {
373         struct proc_dir_entry *pde;
374 
375         pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
376                           &rt_cache_seq_fops);
377         if (!pde)
378                 goto err1;
379 
380         pde = proc_create("rt_cache", S_IRUGO,
381                           net->proc_net_stat, &rt_cpu_seq_fops);
382         if (!pde)
383                 goto err2;
384 
385 #ifdef CONFIG_IP_ROUTE_CLASSID
386         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
387         if (!pde)
388                 goto err3;
389 #endif
390         return 0;
391 
392 #ifdef CONFIG_IP_ROUTE_CLASSID
393 err3:
394         remove_proc_entry("rt_cache", net->proc_net_stat);
395 #endif
396 err2:
397         remove_proc_entry("rt_cache", net->proc_net);
398 err1:
399         return -ENOMEM;
400 }
401 
402 static void __net_exit ip_rt_do_proc_exit(struct net *net)
403 {
404         remove_proc_entry("rt_cache", net->proc_net_stat);
405         remove_proc_entry("rt_cache", net->proc_net);
406 #ifdef CONFIG_IP_ROUTE_CLASSID
407         remove_proc_entry("rt_acct", net->proc_net);
408 #endif
409 }
410 
411 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
412         .init = ip_rt_do_proc_init,
413         .exit = ip_rt_do_proc_exit,
414 };
415 
416 static int __init ip_rt_proc_init(void)
417 {
418         return register_pernet_subsys(&ip_rt_proc_ops);
419 }
420 
421 #else
422 static inline int ip_rt_proc_init(void)
423 {
424         return 0;
425 }
426 #endif /* CONFIG_PROC_FS */
427 
428 static inline bool rt_is_expired(const struct rtable *rth)
429 {
430         return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
431 }
432 
433 void rt_cache_flush(struct net *net)
434 {
435         rt_genid_bump_ipv4(net);
436 }
437 
438 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
439                                            struct sk_buff *skb,
440                                            const void *daddr)
441 {
442         struct net_device *dev = dst->dev;
443         const __be32 *pkey = daddr;
444         const struct rtable *rt;
445         struct neighbour *n;
446 
447         rt = (const struct rtable *) dst;
448         if (rt->rt_gateway)
449                 pkey = (const __be32 *) &rt->rt_gateway;
450         else if (skb)
451                 pkey = &ip_hdr(skb)->daddr;
452 
453         n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
454         if (n)
455                 return n;
456         return neigh_create(&arp_tbl, pkey, dev);
457 }
458 
459 /*
 460  * Peer allocation may fail only in serious out-of-memory conditions.  However,
 461  * we can still generate some output.
 462  * Random ID selection looks a bit dangerous because we have no way to
 463  * ensure the selected ID is unique within a reasonable period of time.
 464  * But a broken packet identifier may be better than no packet at all.
465  */
466 static void ip_select_fb_ident(struct iphdr *iph)
467 {
468         static DEFINE_SPINLOCK(ip_fb_id_lock);
469         static u32 ip_fallback_id;
470         u32 salt;
471 
472         spin_lock_bh(&ip_fb_id_lock);
473         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
474         iph->id = htons(salt & 0xFFFF);
475         ip_fallback_id = salt;
476         spin_unlock_bh(&ip_fb_id_lock);
477 }
478 
479 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
480 {
481         struct net *net = dev_net(dst->dev);
482         struct inet_peer *peer;
483 
484         peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
485         if (peer) {
486                 iph->id = htons(inet_getid(peer, more));
487                 inet_putpeer(peer);
488                 return;
489         }
490 
491         ip_select_fb_ident(iph);
492 }
493 EXPORT_SYMBOL(__ip_select_ident);
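
__ip_select_ident() prefers a monotonically advancing per-destination counter kept on the inet_peer, and only falls back to the salted global generator above when peer allocation fails. In this era inet_getid() (include/net/inetpeer.h) behaves like an atomic post-increment that can reserve several IDs at once for a fragment train; a minimal C11 userspace analogue, with hypothetical names, might be:

        #include <stdatomic.h>
        #include <stdint.h>

        struct fake_peer {                      /* hypothetical stand-in for inet_peer */
                _Atomic int ip_id_count;
        };

        /* Return the previous counter value and advance it by more + 1,
         * mirroring the post-increment semantics of inet_getid().
         */
        static uint16_t fake_getid(struct fake_peer *p, int more)
        {
                return (uint16_t)atomic_fetch_add(&p->ip_id_count, more + 1);
        }
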
494 
495 static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
496                              const struct iphdr *iph,
497                              int oif, u8 tos,
498                              u8 prot, u32 mark, int flow_flags)
499 {
500         if (sk) {
501                 const struct inet_sock *inet = inet_sk(sk);
502 
503                 oif = sk->sk_bound_dev_if;
504                 mark = sk->sk_mark;
505                 tos = RT_CONN_FLAGS(sk);
506                 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
507         }
508         flowi4_init_output(fl4, oif, mark, tos,
509                            RT_SCOPE_UNIVERSE, prot,
510                            flow_flags,
511                            iph->daddr, iph->saddr, 0, 0);
512 }
513 
514 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
515                                const struct sock *sk)
516 {
517         const struct iphdr *iph = ip_hdr(skb);
518         int oif = skb->dev->ifindex;
519         u8 tos = RT_TOS(iph->tos);
520         u8 prot = iph->protocol;
521         u32 mark = skb->mark;
522 
523         __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
524 }
525 
526 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
527 {
528         const struct inet_sock *inet = inet_sk(sk);
529         const struct ip_options_rcu *inet_opt;
530         __be32 daddr = inet->inet_daddr;
531 
532         rcu_read_lock();
533         inet_opt = rcu_dereference(inet->inet_opt);
534         if (inet_opt && inet_opt->opt.srr)
535                 daddr = inet_opt->opt.faddr;
536         flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
537                            RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
538                            inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
539                            inet_sk_flowi_flags(sk),
540                            daddr, inet->inet_saddr, 0, 0);
541         rcu_read_unlock();
542 }
543 
544 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
545                                  const struct sk_buff *skb)
546 {
547         if (skb)
548                 build_skb_flow_key(fl4, skb, sk);
549         else
550                 build_sk_flow_key(fl4, sk);
551 }
552 
553 static inline void rt_free(struct rtable *rt)
554 {
555         call_rcu(&rt->dst.rcu_head, dst_rcu_free);
556 }
557 
558 static DEFINE_SPINLOCK(fnhe_lock);
559 
560 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
561 {
562         struct rtable *rt;
563 
564         rt = rcu_dereference(fnhe->fnhe_rth_input);
565         if (rt) {
566                 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
567                 rt_free(rt);
568         }
569         rt = rcu_dereference(fnhe->fnhe_rth_output);
570         if (rt) {
571                 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
572                 rt_free(rt);
573         }
574 }
575 
576 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
577 {
578         struct fib_nh_exception *fnhe, *oldest;
579 
580         oldest = rcu_dereference(hash->chain);
581         for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
582              fnhe = rcu_dereference(fnhe->fnhe_next)) {
583                 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
584                         oldest = fnhe;
585         }
586         fnhe_flush_routes(oldest);
587         return oldest;
588 }
589 
590 static inline u32 fnhe_hashfun(__be32 daddr)
591 {
592         u32 hval;
593 
594         hval = (__force u32) daddr;
595         hval ^= (hval >> 11) ^ (hval >> 22);
596 
597         return hval & (FNHE_HASH_SIZE - 1);
598 }
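
The hash simply folds the destination address onto itself and masks off the bucket index. As a worked example (assuming FNHE_HASH_SIZE is 2048 in this era, so the mask keeps eleven bits), the same fold can be replayed in userspace:

        #include <stdint.h>
        #include <stdio.h>

        #define FNHE_HASH_SIZE 2048     /* assumed; see include/net/ip_fib.h */

        static uint32_t hashfun(uint32_t daddr)
        {
                uint32_t hval = daddr;

                hval ^= (hval >> 11) ^ (hval >> 22);
                return hval & (FNHE_HASH_SIZE - 1);
        }

        int main(void)
        {
                /* 192.0.2.1 in network byte order on a little-endian host */
                printf("bucket = %u\n", hashfun(0x010200c0));
                return 0;
        }
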
599 
600 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
601 {
602         rt->rt_pmtu = fnhe->fnhe_pmtu;
603         rt->dst.expires = fnhe->fnhe_expires;
604 
605         if (fnhe->fnhe_gw) {
606                 rt->rt_flags |= RTCF_REDIRECTED;
607                 rt->rt_gateway = fnhe->fnhe_gw;
608                 rt->rt_uses_gateway = 1;
609         }
610 }
611 
612 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
613                                   u32 pmtu, unsigned long expires)
614 {
615         struct fnhe_hash_bucket *hash;
616         struct fib_nh_exception *fnhe;
617         struct rtable *rt;
618         unsigned int i;
619         int depth;
620         u32 hval = fnhe_hashfun(daddr);
621 
622         spin_lock_bh(&fnhe_lock);
623 
624         hash = nh->nh_exceptions;
625         if (!hash) {
626                 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
627                 if (!hash)
628                         goto out_unlock;
629                 nh->nh_exceptions = hash;
630         }
631 
632         hash += hval;
633 
634         depth = 0;
635         for (fnhe = rcu_dereference(hash->chain); fnhe;
636              fnhe = rcu_dereference(fnhe->fnhe_next)) {
637                 if (fnhe->fnhe_daddr == daddr)
638                         break;
639                 depth++;
640         }
641 
642         if (fnhe) {
643                 if (gw)
644                         fnhe->fnhe_gw = gw;
645                 if (pmtu) {
646                         fnhe->fnhe_pmtu = pmtu;
647                         fnhe->fnhe_expires = max(1UL, expires);
648                 }
649                 /* Update all cached dsts too */
650                 rt = rcu_dereference(fnhe->fnhe_rth_input);
651                 if (rt)
652                         fill_route_from_fnhe(rt, fnhe);
653                 rt = rcu_dereference(fnhe->fnhe_rth_output);
654                 if (rt)
655                         fill_route_from_fnhe(rt, fnhe);
656         } else {
657                 if (depth > FNHE_RECLAIM_DEPTH)
658                         fnhe = fnhe_oldest(hash);
659                 else {
660                         fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
661                         if (!fnhe)
662                                 goto out_unlock;
663 
664                         fnhe->fnhe_next = hash->chain;
665                         rcu_assign_pointer(hash->chain, fnhe);
666                 }
667                 fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev));
668                 fnhe->fnhe_daddr = daddr;
669                 fnhe->fnhe_gw = gw;
670                 fnhe->fnhe_pmtu = pmtu;
671                 fnhe->fnhe_expires = expires;
672 
 673                 /* Exception created; mark the cached routes for the nexthop
 674                  * as stale, so that anyone caching them rechecks whether this
 675                  * exception applies.
676                  */
677                 rt = rcu_dereference(nh->nh_rth_input);
678                 if (rt)
679                         rt->dst.obsolete = DST_OBSOLETE_KILL;
680 
681                 for_each_possible_cpu(i) {
682                         struct rtable __rcu **prt;
683                         prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
684                         rt = rcu_dereference(*prt);
685                         if (rt)
686                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
687                 }
688         }
689 
690         fnhe->fnhe_stamp = jiffies;
691 
692 out_unlock:
693         spin_unlock_bh(&fnhe_lock);
694 }
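
The two callers later in this file use complementary argument patterns: the redirect path records only a new gateway, while the PMTU path records only an MTU with an expiry. Schematically:

        /* __ip_do_redirect(): a gateway was learned, no PMTU change. */
        update_or_create_fnhe(nh, fl4->daddr, new_gw, 0, 0);

        /* __ip_rt_update_pmtu(): an MTU was learned and must age out. */
        update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
                              jiffies + ip_rt_mtu_expires);
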
695 
696 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
697                              bool kill_route)
698 {
699         __be32 new_gw = icmp_hdr(skb)->un.gateway;
700         __be32 old_gw = ip_hdr(skb)->saddr;
701         struct net_device *dev = skb->dev;
702         struct in_device *in_dev;
703         struct fib_result res;
704         struct neighbour *n;
705         struct net *net;
706 
707         switch (icmp_hdr(skb)->code & 7) {
708         case ICMP_REDIR_NET:
709         case ICMP_REDIR_NETTOS:
710         case ICMP_REDIR_HOST:
711         case ICMP_REDIR_HOSTTOS:
712                 break;
713 
714         default:
715                 return;
716         }
717 
718         if (rt->rt_gateway != old_gw)
719                 return;
720 
721         in_dev = __in_dev_get_rcu(dev);
722         if (!in_dev)
723                 return;
724 
725         net = dev_net(dev);
726         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
727             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
728             ipv4_is_zeronet(new_gw))
729                 goto reject_redirect;
730 
731         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
732                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
733                         goto reject_redirect;
734                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
735                         goto reject_redirect;
736         } else {
737                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
738                         goto reject_redirect;
739         }
740 
741         n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
742         if (n) {
743                 if (!(n->nud_state & NUD_VALID)) {
744                         neigh_event_send(n, NULL);
745                 } else {
746                         if (fib_lookup(net, fl4, &res) == 0) {
747                                 struct fib_nh *nh = &FIB_RES_NH(res);
748 
749                                 update_or_create_fnhe(nh, fl4->daddr, new_gw,
750                                                       0, 0);
751                         }
752                         if (kill_route)
753                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
754                         call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
755                 }
756                 neigh_release(n);
757         }
758         return;
759 
760 reject_redirect:
761 #ifdef CONFIG_IP_ROUTE_VERBOSE
762         if (IN_DEV_LOG_MARTIANS(in_dev)) {
763                 const struct iphdr *iph = (const struct iphdr *) skb->data;
764                 __be32 daddr = iph->daddr;
765                 __be32 saddr = iph->saddr;
766 
767                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
768                                      "  Advised path = %pI4 -> %pI4\n",
769                                      &old_gw, dev->name, &new_gw,
770                                      &saddr, &daddr);
771         }
772 #endif
773         ;
774 }
775 
776 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
777 {
778         struct rtable *rt;
779         struct flowi4 fl4;
780         const struct iphdr *iph = (const struct iphdr *) skb->data;
781         int oif = skb->dev->ifindex;
782         u8 tos = RT_TOS(iph->tos);
783         u8 prot = iph->protocol;
784         u32 mark = skb->mark;
785 
786         rt = (struct rtable *) dst;
787 
788         __build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0);
789         __ip_do_redirect(rt, skb, &fl4, true);
790 }
791 
792 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
793 {
794         struct rtable *rt = (struct rtable *)dst;
795         struct dst_entry *ret = dst;
796 
797         if (rt) {
798                 if (dst->obsolete > 0) {
799                         ip_rt_put(rt);
800                         ret = NULL;
801                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
802                            rt->dst.expires) {
803                         ip_rt_put(rt);
804                         ret = NULL;
805                 }
806         }
807         return ret;
808 }
809 
810 /*
811  * Algorithm:
812  *      1. The first ip_rt_redirect_number redirects are sent
813  *         with exponential backoff, then we stop sending them at all,
814  *         assuming that the host ignores our redirects.
815  *      2. If we did not see packets requiring redirects
 816  *         during ip_rt_redirect_silence, we assume that the host has
 817  *         forgotten the redirected route and start sending redirects again.
818  *
819  * This algorithm is much cheaper and more intelligent than dumb load limiting
820  * in icmp.c.
821  *
822  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
823  * and "frag. need" (breaks PMTU discovery) in icmp.c.
824  */
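
With the defaults above (ip_rt_redirect_number = 9, ip_rt_redirect_load = HZ/50, ip_rt_redirect_silence = (HZ/50) << 10), and assuming HZ = 1000 for concreteness, the gap required before the next redirect doubles with each one sent: 40 ms after the first, 80 ms after the second, up to about 5.1 s before the ninth, after which sending stops until roughly 20.5 s of silence. A minimal sketch of that schedule:

        #include <stdio.h>

        #define HZ 1000                         /* assumed for the arithmetic */

        int main(void)
        {
                const int load = HZ / 50;       /* ip_rt_redirect_load, jiffies */
                const int number = 9;           /* ip_rt_redirect_number */
                int tokens;

                for (tokens = 1; tokens < number; tokens++)
                        printf("gap before redirect %d: %d ms\n",
                               tokens + 1, (load << tokens) * 1000 / HZ);
                printf("silence before reset: %d ms\n",
                       (load << (number + 1)) * 1000 / HZ);
                return 0;
        }
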
825 
826 void ip_rt_send_redirect(struct sk_buff *skb)
827 {
828         struct rtable *rt = skb_rtable(skb);
829         struct in_device *in_dev;
830         struct inet_peer *peer;
831         struct net *net;
832         int log_martians;
833 
834         rcu_read_lock();
835         in_dev = __in_dev_get_rcu(rt->dst.dev);
836         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
837                 rcu_read_unlock();
838                 return;
839         }
840         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
841         rcu_read_unlock();
842 
843         net = dev_net(rt->dst.dev);
844         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
845         if (!peer) {
846                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
847                           rt_nexthop(rt, ip_hdr(skb)->daddr));
848                 return;
849         }
850 
851         /* No redirected packets during ip_rt_redirect_silence;
852          * reset the algorithm.
853          */
854         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
855                 peer->rate_tokens = 0;
856 
 857         /* Too many ignored redirects; do not send anything.
 858          * Set peer->rate_last to the time of the last seen redirected packet.
859          */
860         if (peer->rate_tokens >= ip_rt_redirect_number) {
861                 peer->rate_last = jiffies;
862                 goto out_put_peer;
863         }
864 
865         /* Check for load limit; set rate_last to the latest sent
866          * redirect.
867          */
868         if (peer->rate_tokens == 0 ||
869             time_after(jiffies,
870                        (peer->rate_last +
871                         (ip_rt_redirect_load << peer->rate_tokens)))) {
872                 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
873 
874                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
875                 peer->rate_last = jiffies;
876                 ++peer->rate_tokens;
877 #ifdef CONFIG_IP_ROUTE_VERBOSE
878                 if (log_martians &&
879                     peer->rate_tokens == ip_rt_redirect_number)
880                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
881                                              &ip_hdr(skb)->saddr, inet_iif(skb),
882                                              &ip_hdr(skb)->daddr, &gw);
883 #endif
884         }
885 out_put_peer:
886         inet_putpeer(peer);
887 }
888 
889 static int ip_error(struct sk_buff *skb)
890 {
891         struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
892         struct rtable *rt = skb_rtable(skb);
893         struct inet_peer *peer;
894         unsigned long now;
895         struct net *net;
896         bool send;
897         int code;
898 
899         net = dev_net(rt->dst.dev);
900         if (!IN_DEV_FORWARD(in_dev)) {
901                 switch (rt->dst.error) {
902                 case EHOSTUNREACH:
903                         IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
904                         break;
905 
906                 case ENETUNREACH:
907                         IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
908                         break;
909                 }
910                 goto out;
911         }
912 
913         switch (rt->dst.error) {
914         case EINVAL:
915         default:
916                 goto out;
917         case EHOSTUNREACH:
918                 code = ICMP_HOST_UNREACH;
919                 break;
920         case ENETUNREACH:
921                 code = ICMP_NET_UNREACH;
922                 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
923                 break;
924         case EACCES:
925                 code = ICMP_PKT_FILTERED;
926                 break;
927         }
928 
929         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
930 
931         send = true;
932         if (peer) {
933                 now = jiffies;
934                 peer->rate_tokens += now - peer->rate_last;
935                 if (peer->rate_tokens > ip_rt_error_burst)
936                         peer->rate_tokens = ip_rt_error_burst;
937                 peer->rate_last = now;
938                 if (peer->rate_tokens >= ip_rt_error_cost)
939                         peer->rate_tokens -= ip_rt_error_cost;
940                 else
941                         send = false;
942                 inet_putpeer(peer);
943         }
944         if (send)
945                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
946 
947 out:    kfree_skb(skb);
948         return 0;
949 }
950 
951 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
952 {
953         struct dst_entry *dst = &rt->dst;
954         struct fib_result res;
955 
956         if (dst_metric_locked(dst, RTAX_MTU))
957                 return;
958 
959         if (dst->dev->mtu < mtu)
960                 return;
961 
962         if (mtu < ip_rt_min_pmtu)
963                 mtu = ip_rt_min_pmtu;
964 
965         if (rt->rt_pmtu == mtu &&
966             time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
967                 return;
968 
969         rcu_read_lock();
970         if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
971                 struct fib_nh *nh = &FIB_RES_NH(res);
972 
973                 update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
974                                       jiffies + ip_rt_mtu_expires);
975         }
976         rcu_read_unlock();
977 }
978 
979 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
980                               struct sk_buff *skb, u32 mtu)
981 {
982         struct rtable *rt = (struct rtable *) dst;
983         struct flowi4 fl4;
984 
985         ip_rt_build_flow_key(&fl4, sk, skb);
986         __ip_rt_update_pmtu(rt, &fl4, mtu);
987 }
988 
989 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
990                       int oif, u32 mark, u8 protocol, int flow_flags)
991 {
992         const struct iphdr *iph = (const struct iphdr *) skb->data;
993         struct flowi4 fl4;
994         struct rtable *rt;
995 
996         __build_flow_key(&fl4, NULL, iph, oif,
997                          RT_TOS(iph->tos), protocol, mark, flow_flags);
998         rt = __ip_route_output_key(net, &fl4);
999         if (!IS_ERR(rt)) {
1000                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1001                 ip_rt_put(rt);
1002         }
1003 }
1004 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1005 
1006 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1007 {
1008         const struct iphdr *iph = (const struct iphdr *) skb->data;
1009         struct flowi4 fl4;
1010         struct rtable *rt;
1011 
1012         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1013         rt = __ip_route_output_key(sock_net(sk), &fl4);
1014         if (!IS_ERR(rt)) {
1015                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1016                 ip_rt_put(rt);
1017         }
1018 }
1019 
1020 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1021 {
1022         const struct iphdr *iph = (const struct iphdr *) skb->data;
1023         struct flowi4 fl4;
1024         struct rtable *rt;
1025         struct dst_entry *dst;
1026         bool new = false;
1027 
1028         bh_lock_sock(sk);
1029 
1030         if (!ip_sk_accept_pmtu(sk))
1031                 goto out;
1032 
1033         rt = (struct rtable *) __sk_dst_get(sk);
1034 
1035         if (sock_owned_by_user(sk) || !rt) {
1036                 __ipv4_sk_update_pmtu(skb, sk, mtu);
1037                 goto out;
1038         }
1039 
1040         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1041 
1042         if (!__sk_dst_check(sk, 0)) {
1043                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1044                 if (IS_ERR(rt))
1045                         goto out;
1046 
1047                 new = true;
1048         }
1049 
1050         __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);
1051 
1052         dst = dst_check(&rt->dst, 0);
1053         if (!dst) {
1054                 if (new)
1055                         dst_release(&rt->dst);
1056 
1057                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1058                 if (IS_ERR(rt))
1059                         goto out;
1060 
1061                 new = true;
1062         }
1063 
1064         if (new)
1065                 __sk_dst_set(sk, &rt->dst);
1066 
1067 out:
1068         bh_unlock_sock(sk);
1069 }
1070 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1071 
1072 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1073                    int oif, u32 mark, u8 protocol, int flow_flags)
1074 {
1075         const struct iphdr *iph = (const struct iphdr *) skb->data;
1076         struct flowi4 fl4;
1077         struct rtable *rt;
1078 
1079         __build_flow_key(&fl4, NULL, iph, oif,
1080                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1081         rt = __ip_route_output_key(net, &fl4);
1082         if (!IS_ERR(rt)) {
1083                 __ip_do_redirect(rt, skb, &fl4, false);
1084                 ip_rt_put(rt);
1085         }
1086 }
1087 EXPORT_SYMBOL_GPL(ipv4_redirect);
1088 
1089 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1090 {
1091         const struct iphdr *iph = (const struct iphdr *) skb->data;
1092         struct flowi4 fl4;
1093         struct rtable *rt;
1094 
1095         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1096         rt = __ip_route_output_key(sock_net(sk), &fl4);
1097         if (!IS_ERR(rt)) {
1098                 __ip_do_redirect(rt, skb, &fl4, false);
1099                 ip_rt_put(rt);
1100         }
1101 }
1102 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1103 
1104 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1105 {
1106         struct rtable *rt = (struct rtable *) dst;
1107 
1108         /* All IPv4 dsts are created with ->obsolete set to the value
1109          * DST_OBSOLETE_FORCE_CHK, which forces validation calls down
1110          * into this function in all cases.
1111          *
1112          * When a PMTU/redirect information update invalidates a route,
1113          * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1114          * DST_OBSOLETE_DEAD by dst_free().
1115          */
1116         if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1117                 return NULL;
1118         return dst;
1119 }
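
The generic entry point that drives this is dst_check(); in include/net/dst.h of this era it reduces to roughly the following, which is why a non-zero ->obsolete guarantees the callback fires:

        static inline struct dst_entry *dst_check(struct dst_entry *dst, u32 cookie)
        {
                /* Every IPv4 dst has a non-zero ->obsolete, so ->check()
                 * (ipv4_dst_check above) runs on every validation.
                 */
                if (dst->obsolete)
                        dst = dst->ops->check(dst, cookie);
                return dst;
        }
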
1120 
1121 static void ipv4_link_failure(struct sk_buff *skb)
1122 {
1123         struct rtable *rt;
1124 
1125         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1126 
1127         rt = skb_rtable(skb);
1128         if (rt)
1129                 dst_set_expires(&rt->dst, 0);
1130 }
1131 
1132 static int ip_rt_bug(struct sock *sk, struct sk_buff *skb)
1133 {
1134         pr_debug("%s: %pI4 -> %pI4, %s\n",
1135                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1136                  skb->dev ? skb->dev->name : "?");
1137         kfree_skb(skb);
1138         WARN_ON(1);
1139         return 0;
1140 }
1141 
1142 /*
1143    We do not cache the source address of the outgoing interface,
1144    because it is used only by the IP RR, TS and SRR options,
1145    so it is out of the fast path.
1146 
1147    BTW remember: "addr" may be unaligned
1148    in IP options!
1149  */
1150 
1151 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1152 {
1153         __be32 src;
1154 
1155         if (rt_is_output_route(rt))
1156                 src = ip_hdr(skb)->saddr;
1157         else {
1158                 struct fib_result res;
1159                 struct flowi4 fl4;
1160                 struct iphdr *iph;
1161 
1162                 iph = ip_hdr(skb);
1163 
1164                 memset(&fl4, 0, sizeof(fl4));
1165                 fl4.daddr = iph->daddr;
1166                 fl4.saddr = iph->saddr;
1167                 fl4.flowi4_tos = RT_TOS(iph->tos);
1168                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1169                 fl4.flowi4_iif = skb->dev->ifindex;
1170                 fl4.flowi4_mark = skb->mark;
1171 
1172                 rcu_read_lock();
1173                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1174                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1175                 else
1176                         src = inet_select_addr(rt->dst.dev,
1177                                                rt_nexthop(rt, iph->daddr),
1178                                                RT_SCOPE_UNIVERSE);
1179                 rcu_read_unlock();
1180         }
1181         memcpy(addr, &src, 4);
1182 }
1183 
1184 #ifdef CONFIG_IP_ROUTE_CLASSID
1185 static void set_class_tag(struct rtable *rt, u32 tag)
1186 {
1187         if (!(rt->dst.tclassid & 0xFFFF))
1188                 rt->dst.tclassid |= tag & 0xFFFF;
1189         if (!(rt->dst.tclassid & 0xFFFF0000))
1190                 rt->dst.tclassid |= tag & 0xFFFF0000;
1191 }
1192 #endif
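
The tclassid word packs two 16-bit routing realms, conventionally the destination ("to") realm in the low half and the source ("from") realm in the high half; set_class_tag() fills only halves that are still zero, so a tag set earlier wins. A worked example with made-up values:

        u32 tclassid = 0x00050000;      /* source realm 5 already set */
        u32 tag      = 0x00030002;      /* source realm 3, destination realm 2 */

        if (!(tclassid & 0xFFFF))
                tclassid |= tag & 0xFFFF;          /* takes destination realm 2 */
        if (!(tclassid & 0xFFFF0000))
                tclassid |= tag & 0xFFFF0000;      /* skipped: realm 5 is kept */

        /* Result: tclassid == 0x00050002 */
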
1193 
1194 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1195 {
1196         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1197 
1198         if (advmss == 0) {
1199                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1200                                ip_rt_min_advmss);
1201                 if (advmss > 65535 - 40)
1202                         advmss = 65535 - 40;
1203         }
1204         return advmss;
1205 }
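
The 40 bytes subtracted are the minimal IPv4 plus TCP headers, so the default advertised MSS on a plain 1500-byte Ethernet MTU works out as follows:

        /* Worked example: dst->dev->mtu = 1500, RTAX_ADVMSS metric unset.
         *
         *   advmss = max(1500 - 40, ip_rt_min_advmss = 256) = 1460
         *   upper clamp 65535 - 40 = 65495 is not reached here.
         */
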
1206 
1207 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1208 {
1209         const struct rtable *rt = (const struct rtable *) dst;
1210         unsigned int mtu = rt->rt_pmtu;
1211 
1212         if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1213                 mtu = dst_metric_raw(dst, RTAX_MTU);
1214 
1215         if (mtu)
1216                 return mtu;
1217 
1218         mtu = dst->dev->mtu;
1219 
1220         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1221                 if (rt->rt_uses_gateway && mtu > 576)
1222                         mtu = 576;
1223         }
1224 
1225         return min_t(unsigned int, mtu, IP_MAX_MTU);
1226 }
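
Resolution order: a still-valid learned PMTU wins, then an explicit RTAX_MTU metric, and only then the device MTU, with the locked-metric gateway clamp and the IP_MAX_MTU cap applied at the end. For illustration:

        /* Worked examples (values are illustrative):
         *
         *   rt_pmtu = 1400, not expired                  -> 1400
         *   rt_pmtu expired, RTAX_MTU = 1280             -> 1280
         *   neither set, dev->mtu = 1500                 -> 1500
         *   neither set, MTU locked, via gw, dev = 9000  -> 576
         */
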
1227 
1228 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1229 {
1230         struct fnhe_hash_bucket *hash = nh->nh_exceptions;
1231         struct fib_nh_exception *fnhe;
1232         u32 hval;
1233 
1234         if (!hash)
1235                 return NULL;
1236 
1237         hval = fnhe_hashfun(daddr);
1238 
1239         for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1240              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1241                 if (fnhe->fnhe_daddr == daddr)
1242                         return fnhe;
1243         }
1244         return NULL;
1245 }
1246 
1247 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1248                               __be32 daddr)
1249 {
1250         bool ret = false;
1251 
1252         spin_lock_bh(&fnhe_lock);
1253 
1254         if (daddr == fnhe->fnhe_daddr) {
1255                 struct rtable __rcu **porig;
1256                 struct rtable *orig;
1257                 int genid = fnhe_genid(dev_net(rt->dst.dev));
1258 
1259                 if (rt_is_input_route(rt))
1260                         porig = &fnhe->fnhe_rth_input;
1261                 else
1262                         porig = &fnhe->fnhe_rth_output;
1263                 orig = rcu_dereference(*porig);
1264 
1265                 if (fnhe->fnhe_genid != genid) {
1266                         fnhe->fnhe_genid = genid;
1267                         fnhe->fnhe_gw = 0;
1268                         fnhe->fnhe_pmtu = 0;
1269                         fnhe->fnhe_expires = 0;
1270                         fnhe_flush_routes(fnhe);
1271                         orig = NULL;
1272                 }
1273                 fill_route_from_fnhe(rt, fnhe);
1274                 if (!rt->rt_gateway)
1275                         rt->rt_gateway = daddr;
1276 
1277                 if (!(rt->dst.flags & DST_NOCACHE)) {
1278                         rcu_assign_pointer(*porig, rt);
1279                         if (orig)
1280                                 rt_free(orig);
1281                         ret = true;
1282                 }
1283 
1284                 fnhe->fnhe_stamp = jiffies;
1285         }
1286         spin_unlock_bh(&fnhe_lock);
1287 
1288         return ret;
1289 }
1290 
1291 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1292 {
1293         struct rtable *orig, *prev, **p;
1294         bool ret = true;
1295 
1296         if (rt_is_input_route(rt)) {
1297                 p = (struct rtable **)&nh->nh_rth_input;
1298         } else {
1299                 p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
1300         }
1301         orig = *p;
1302 
1303         prev = cmpxchg(p, orig, rt);
1304         if (prev == orig) {
1305                 if (orig)
1306                         rt_free(orig);
1307         } else
1308                 ret = false;
1309 
1310         return ret;
1311 }
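
The cmpxchg() makes publication lock-free: the updater that swaps out the old pointer wins and frees the displaced route via RCU, while a loser returns false and the caller (rt_set_nexthop() below) falls back to DST_NOCACHE. A self-contained C11 analogue of the pattern, with stand-in types:

        #include <stdatomic.h>
        #include <stdbool.h>
        #include <stdlib.h>

        struct route { int id; };               /* stand-in for struct rtable */

        static void route_free(struct route *rt)
        {
                free(rt);                       /* the kernel defers this via RCU */
        }

        static bool cache_route(struct route * _Atomic *slot, struct route *rt)
        {
                struct route *orig = atomic_load(slot);

                /* Publish rt only if the slot still holds what we first read. */
                if (!atomic_compare_exchange_strong(slot, &orig, rt))
                        return false;           /* lost the race: do not cache */
                if (orig)
                        route_free(orig);       /* free the displaced route */
                return true;
        }
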
1312 
1313 static DEFINE_SPINLOCK(rt_uncached_lock);
1314 static LIST_HEAD(rt_uncached_list);
1315 
1316 static void rt_add_uncached_list(struct rtable *rt)
1317 {
1318         spin_lock_bh(&rt_uncached_lock);
1319         list_add_tail(&rt->rt_uncached, &rt_uncached_list);
1320         spin_unlock_bh(&rt_uncached_lock);
1321 }
1322 
1323 static void ipv4_dst_destroy(struct dst_entry *dst)
1324 {
1325         struct rtable *rt = (struct rtable *) dst;
1326 
1327         if (!list_empty(&rt->rt_uncached)) {
1328                 spin_lock_bh(&rt_uncached_lock);
1329                 list_del(&rt->rt_uncached);
1330                 spin_unlock_bh(&rt_uncached_lock);
1331         }
1332 }
1333 
1334 void rt_flush_dev(struct net_device *dev)
1335 {
1336         if (!list_empty(&rt_uncached_list)) {
1337                 struct net *net = dev_net(dev);
1338                 struct rtable *rt;
1339 
1340                 spin_lock_bh(&rt_uncached_lock);
1341                 list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
1342                         if (rt->dst.dev != dev)
1343                                 continue;
1344                         rt->dst.dev = net->loopback_dev;
1345                         dev_hold(rt->dst.dev);
1346                         dev_put(dev);
1347                 }
1348                 spin_unlock_bh(&rt_uncached_lock);
1349         }
1350 }
1351 
1352 static bool rt_cache_valid(const struct rtable *rt)
1353 {
1354         return  rt &&
1355                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1356                 !rt_is_expired(rt);
1357 }
1358 
1359 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1360                            const struct fib_result *res,
1361                            struct fib_nh_exception *fnhe,
1362                            struct fib_info *fi, u16 type, u32 itag)
1363 {
1364         bool cached = false;
1365 
1366         if (fi) {
1367                 struct fib_nh *nh = &FIB_RES_NH(*res);
1368 
1369                 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1370                         rt->rt_gateway = nh->nh_gw;
1371                         rt->rt_uses_gateway = 1;
1372                 }
1373                 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1374 #ifdef CONFIG_IP_ROUTE_CLASSID
1375                 rt->dst.tclassid = nh->nh_tclassid;
1376 #endif
1377                 if (unlikely(fnhe))
1378                         cached = rt_bind_exception(rt, fnhe, daddr);
1379                 else if (!(rt->dst.flags & DST_NOCACHE))
1380                         cached = rt_cache_route(nh, rt);
1381                 if (unlikely(!cached)) {
1382                         /* Routes we intend to cache in the nexthop exception or
1383                          * FIB nexthop have the DST_NOCACHE bit clear.
1384                          * However, if we are unsuccessful at storing this
1385                          * route in the cache, we really need to set it.
1386                          */
1387                         rt->dst.flags |= DST_NOCACHE;
1388                         if (!rt->rt_gateway)
1389                                 rt->rt_gateway = daddr;
1390                         rt_add_uncached_list(rt);
1391                 }
1392         } else
1393                 rt_add_uncached_list(rt);
1394 
1395 #ifdef CONFIG_IP_ROUTE_CLASSID
1396 #ifdef CONFIG_IP_MULTIPLE_TABLES
1397         set_class_tag(rt, res->tclassid);
1398 #endif
1399         set_class_tag(rt, itag);
1400 #endif
1401 }
1402 
1403 static struct rtable *rt_dst_alloc(struct net_device *dev,
1404                                    bool nopolicy, bool noxfrm, bool will_cache)
1405 {
1406         return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1407                          (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
1408                          (nopolicy ? DST_NOPOLICY : 0) |
1409                          (noxfrm ? DST_NOXFRM : 0));
1410 }
1411 
1412 /* called in rcu_read_lock() section */
1413 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1414                                 u8 tos, struct net_device *dev, int our)
1415 {
1416         struct rtable *rth;
1417         struct in_device *in_dev = __in_dev_get_rcu(dev);
1418         u32 itag = 0;
1419         int err;
1420 
1421         /* Primary sanity checks. */
1422 
1423         if (in_dev == NULL)
1424                 return -EINVAL;
1425 
1426         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1427             skb->protocol != htons(ETH_P_IP))
1428                 goto e_inval;
1429 
1430         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1431                 if (ipv4_is_loopback(saddr))
1432                         goto e_inval;
1433 
1434         if (ipv4_is_zeronet(saddr)) {
1435                 if (!ipv4_is_local_multicast(daddr))
1436                         goto e_inval;
1437         } else {
1438                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1439                                           in_dev, &itag);
1440                 if (err < 0)
1441                         goto e_err;
1442         }
1443         rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
1444                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1445         if (!rth)
1446                 goto e_nobufs;
1447 
1448 #ifdef CONFIG_IP_ROUTE_CLASSID
1449         rth->dst.tclassid = itag;
1450 #endif
1451         rth->dst.output = ip_rt_bug;
1452 
1453         rth->rt_genid   = rt_genid_ipv4(dev_net(dev));
1454         rth->rt_flags   = RTCF_MULTICAST;
1455         rth->rt_type    = RTN_MULTICAST;
1456         rth->rt_is_input= 1;
1457         rth->rt_iif     = 0;
1458         rth->rt_pmtu    = 0;
1459         rth->rt_gateway = 0;
1460         rth->rt_uses_gateway = 0;
1461         INIT_LIST_HEAD(&rth->rt_uncached);
1462         if (our) {
1463                 rth->dst.input= ip_local_deliver;
1464                 rth->rt_flags |= RTCF_LOCAL;
1465         }
1466 
1467 #ifdef CONFIG_IP_MROUTE
1468         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1469                 rth->dst.input = ip_mr_input;
1470 #endif
1471         RT_CACHE_STAT_INC(in_slow_mc);
1472 
1473         skb_dst_set(skb, &rth->dst);
1474         return 0;
1475 
1476 e_nobufs:
1477         return -ENOBUFS;
1478 e_inval:
1479         return -EINVAL;
1480 e_err:
1481         return err;
1482 }
1483 
1484 
1485 static void ip_handle_martian_source(struct net_device *dev,
1486                                      struct in_device *in_dev,
1487                                      struct sk_buff *skb,
1488                                      __be32 daddr,
1489                                      __be32 saddr)
1490 {
1491         RT_CACHE_STAT_INC(in_martian_src);
1492 #ifdef CONFIG_IP_ROUTE_VERBOSE
1493         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1494                 /*
1495                  *      Per the RFC 1812 recommendation: if the source is martian,
1496                  *      the only hint is the MAC header.
1497                  */
1498                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1499                         &daddr, &saddr, dev->name);
1500                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1501                         print_hex_dump(KERN_WARNING, "ll header: ",
1502                                        DUMP_PREFIX_OFFSET, 16, 1,
1503                                        skb_mac_header(skb),
1504                                        dev->hard_header_len, true);
1505                 }
1506         }
1507 #endif
1508 }
1509 
1510 /* called in rcu_read_lock() section */
1511 static int __mkroute_input(struct sk_buff *skb,
1512                            const struct fib_result *res,
1513                            struct in_device *in_dev,
1514                            __be32 daddr, __be32 saddr, u32 tos)
1515 {
1516         struct fib_nh_exception *fnhe;
1517         struct rtable *rth;
1518         int err;
1519         struct in_device *out_dev;
1520         unsigned int flags = 0;
1521         bool do_cache;
1522         u32 itag = 0;
1523 
1524         /* get a working reference to the output device */
1525         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1526         if (out_dev == NULL) {
1527                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1528                 return -EINVAL;
1529         }
1530 
1531         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1532                                   in_dev->dev, in_dev, &itag);
1533         if (err < 0) {
1534                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1535                                          saddr);
1536 
1537                 goto cleanup;
1538         }
1539 
1540         do_cache = res->fi && !itag;
1541         if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1542             (IN_DEV_SHARED_MEDIA(out_dev) ||
1543              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) {
1544                 flags |= RTCF_DOREDIRECT;
1545                 do_cache = false;
1546         }
1547 
1548         if (skb->protocol != htons(ETH_P_IP)) {
1549         /* Not IP (i.e. ARP). Do not create a route if it is
1550          * invalid for proxy arp. DNAT routes are always valid.
1551          *
1552          * The proxy arp feature has been extended to allow ARP
1553          * replies back out the same interface, to support
1554          * Private VLAN switch technologies. See arp.c.
1555          */
1556                 if (out_dev == in_dev &&
1557                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1558                         err = -EINVAL;
1559                         goto cleanup;
1560                 }
1561         }
1562 
1563         fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1564         if (do_cache) {
1565                 if (fnhe != NULL)
1566                         rth = rcu_dereference(fnhe->fnhe_rth_input);
1567                 else
1568                         rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1569 
1570                 if (rt_cache_valid(rth)) {
1571                         skb_dst_set_noref(skb, &rth->dst);
1572                         goto out;
1573                 }
1574         }
1575 
1576         rth = rt_dst_alloc(out_dev->dev,
1577                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1578                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1579         if (!rth) {
1580                 err = -ENOBUFS;
1581                 goto cleanup;
1582         }
1583 
1584         rth->rt_genid = rt_genid_ipv4(dev_net(rth->dst.dev));
1585         rth->rt_flags = flags;
1586         rth->rt_type = res->type;
1587         rth->rt_is_input = 1;
1588         rth->rt_iif     = 0;
1589         rth->rt_pmtu    = 0;
1590         rth->rt_gateway = 0;
1591         rth->rt_uses_gateway = 0;
1592         INIT_LIST_HEAD(&rth->rt_uncached);
1593         RT_CACHE_STAT_INC(in_slow_tot);
1594 
1595         rth->dst.input = ip_forward;
1596         rth->dst.output = ip_output;
1597 
1598         rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
1599         skb_dst_set(skb, &rth->dst);
1600 out:
1601         err = 0;
1602  cleanup:
1603         return err;
1604 }
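/* Editor's note -- the fast path above hinges on rt_cache_valid(), which
 * (defined earlier in this file) is roughly:
 *
 *	static bool rt_cache_valid(const struct rtable *rt)
 *	{
 *		return rt &&
 *		       rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
 *		       !rt_is_expired(rt);
 *	}
 *
 * i.e. a cached input route is reused only while its generation id still
 * matches the netns generation and no flush has invalidated it.
 */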
1605 
1606 static int ip_mkroute_input(struct sk_buff *skb,
1607                             struct fib_result *res,
1608                             const struct flowi4 *fl4,
1609                             struct in_device *in_dev,
1610                             __be32 daddr, __be32 saddr, u32 tos)
1611 {
1612 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1613         if (res->fi && res->fi->fib_nhs > 1)
1614                 fib_select_multipath(res);
1615 #endif
1616 
1617         /* create a routing cache entry */
1618         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1619 }
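/* Editor's note (hedged): with CONFIG_IP_ROUTE_MULTIPATH,
 * fib_select_multipath() picks one nexthop from res->fi according to the
 * configured nexthop weights before the cache entry is built, so
 * equal-cost routes spread input traffic across their nexthops, e.g. for
 * a route configured as:
 *
 *	ip route add 10.0.0.0/24 nexthop via 192.0.2.1 weight 1 \
 *	                         nexthop via 192.0.2.2 weight 2
 */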
1620 
1621 /*
1622  *      NOTE. We drop all packets that have local source
1623  *      addresses, because every properly looped-back packet
1624  *      must already have the correct destination attached by the output routine.
1625  *
1626  *      This approach solves two big problems:
1627  *      1. Non-simplex devices are handled properly.
1628  *      2. IP spoofing attempts are filtered with a 100% guarantee.
1629  *      Called with rcu_read_lock().
1630  */
1631 
1632 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1633                                u8 tos, struct net_device *dev)
1634 {
1635         struct fib_result res;
1636         struct in_device *in_dev = __in_dev_get_rcu(dev);
1637         struct flowi4   fl4;
1638         unsigned int    flags = 0;
1639         u32             itag = 0;
1640         struct rtable   *rth;
1641         int             err = -EINVAL;
1642         struct net    *net = dev_net(dev);
1643         bool do_cache;
1644 
1645         /* IP on this device is disabled. */
1646 
1647         if (!in_dev)
1648                 goto out;
1649 
1650         /* Check for the weirdest martians, which cannot be detected
1651            by fib_lookup.
1652          */
1653 
1654         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1655                 goto martian_source;
1656 
1657         res.fi = NULL;
1658         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1659                 goto brd_input;
1660 
1661         /* Accept zero addresses only for limited broadcast;
1662          * it is unclear whether this should be fixed. Waiting for complaints :-)
1663          */
1664         if (ipv4_is_zeronet(saddr))
1665                 goto martian_source;
1666 
1667         if (ipv4_is_zeronet(daddr))
1668                 goto martian_destination;
1669 
1670         /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET()
1671          * more than once, calling it only when daddr and/or saddr is a loopback address.
1672          */
1673         if (ipv4_is_loopback(daddr)) {
1674                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1675                         goto martian_destination;
1676         } else if (ipv4_is_loopback(saddr)) {
1677                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1678                         goto martian_source;
1679         }
1680 
1681         /*
1682          *      Now we are ready to route the packet.
1683          */
1684         fl4.flowi4_oif = 0;
1685         fl4.flowi4_iif = dev->ifindex;
1686         fl4.flowi4_mark = skb->mark;
1687         fl4.flowi4_tos = tos;
1688         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1689         fl4.daddr = daddr;
1690         fl4.saddr = saddr;
1691         err = fib_lookup(net, &fl4, &res);
1692         if (err != 0) {
1693                 if (!IN_DEV_FORWARD(in_dev))
1694                         err = -EHOSTUNREACH;
1695                 goto no_route;
1696         }
1697 
1698         if (res.type == RTN_BROADCAST)
1699                 goto brd_input;
1700 
1701         if (res.type == RTN_LOCAL) {
1702                 err = fib_validate_source(skb, saddr, daddr, tos,
1703                                           0, dev, in_dev, &itag);
1704                 if (err < 0)
1705                         goto martian_source_keep_err;
1706                 goto local_input;
1707         }
1708 
1709         if (!IN_DEV_FORWARD(in_dev)) {
1710                 err = -EHOSTUNREACH;
1711                 goto no_route;
1712         }
1713         if (res.type != RTN_UNICAST)
1714                 goto martian_destination;
1715 
1716         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1717 out:    return err;
1718 
1719 brd_input:
1720         if (skb->protocol != htons(ETH_P_IP))
1721                 goto e_inval;
1722 
1723         if (!ipv4_is_zeronet(saddr)) {
1724                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1725                                           in_dev, &itag);
1726                 if (err < 0)
1727                         goto martian_source_keep_err;
1728         }
1729         flags |= RTCF_BROADCAST;
1730         res.type = RTN_BROADCAST;
1731         RT_CACHE_STAT_INC(in_brd);
1732 
1733 local_input:
1734         do_cache = false;
1735         if (res.fi) {
1736                 if (!itag) {
1737                         rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1738                         if (rt_cache_valid(rth)) {
1739                                 skb_dst_set_noref(skb, &rth->dst);
1740                                 err = 0;
1741                                 goto out;
1742                         }
1743                         do_cache = true;
1744                 }
1745         }
1746 
1747         rth = rt_dst_alloc(net->loopback_dev,
1748                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1749         if (!rth)
1750                 goto e_nobufs;
1751 
1752         rth->dst.input= ip_local_deliver;
1753         rth->dst.output= ip_rt_bug;
1754 #ifdef CONFIG_IP_ROUTE_CLASSID
1755         rth->dst.tclassid = itag;
1756 #endif
1757 
1758         rth->rt_genid = rt_genid_ipv4(net);
1759         rth->rt_flags   = flags|RTCF_LOCAL;
1760         rth->rt_type    = res.type;
1761         rth->rt_is_input = 1;
1762         rth->rt_iif     = 0;
1763         rth->rt_pmtu    = 0;
1764         rth->rt_gateway = 0;
1765         rth->rt_uses_gateway = 0;
1766         INIT_LIST_HEAD(&rth->rt_uncached);
1767         RT_CACHE_STAT_INC(in_slow_tot);
1768         if (res.type == RTN_UNREACHABLE) {
1769                 rth->dst.input= ip_error;
1770                 rth->dst.error= -err;
1771                 rth->rt_flags   &= ~RTCF_LOCAL;
1772         }
1773         if (do_cache) {
1774                 if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
1775                         rth->dst.flags |= DST_NOCACHE;
1776                         rt_add_uncached_list(rth);
1777                 }
1778         }
1779         skb_dst_set(skb, &rth->dst);
1780         err = 0;
1781         goto out;
1782 
1783 no_route:
1784         RT_CACHE_STAT_INC(in_no_route);
1785         res.type = RTN_UNREACHABLE;
1786         if (err == -ESRCH)
1787                 err = -ENETUNREACH;
1788         goto local_input;
1789 
1790         /*
1791          *      Do not cache martian addresses: they should be logged (RFC1812)
1792          */
1793 martian_destination:
1794         RT_CACHE_STAT_INC(in_martian_dst);
1795 #ifdef CONFIG_IP_ROUTE_VERBOSE
1796         if (IN_DEV_LOG_MARTIANS(in_dev))
1797                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1798                                      &daddr, &saddr, dev->name);
1799 #endif
1800 
1801 e_inval:
1802         err = -EINVAL;
1803         goto out;
1804 
1805 e_nobufs:
1806         err = -ENOBUFS;
1807         goto out;
1808 
1809 martian_source:
1810         err = -EINVAL;
1811 martian_source_keep_err:
1812         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1813         goto out;
1814 }
1815 
1816 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1817                          u8 tos, struct net_device *dev)
1818 {
1819         int res;
1820 
1821         rcu_read_lock();
1822 
1823         /* Multicast recognition logic was moved from the route cache to here.
1824            The problem was that too many Ethernet cards have broken/missing
1825            hardware multicast filters :-( As a result, a host on a multicast
1826            network acquires a lot of useless route cache entries, e.g. for
1827            SDR messages from all over the world. Now we try to get rid of them.
1828            Really, provided the software IP multicast filter is organized
1829            reasonably (at least, hashed), this does not cause a slowdown
1830            compared with route cache reject entries.
1831            Note that multicast routers are not affected, because a
1832            route cache entry is created eventually.
1833          */
1834         if (ipv4_is_multicast(daddr)) {
1835                 struct in_device *in_dev = __in_dev_get_rcu(dev);
1836 
1837                 if (in_dev) {
1838                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1839                                                   ip_hdr(skb)->protocol);
1840                         if (our
1841 #ifdef CONFIG_IP_MROUTE
1842                                 ||
1843                             (!ipv4_is_local_multicast(daddr) &&
1844                              IN_DEV_MFORWARD(in_dev))
1845 #endif
1846                            ) {
1847                                 int res = ip_route_input_mc(skb, daddr, saddr,
1848                                                             tos, dev, our);
1849                                 rcu_read_unlock();
1850                                 return res;
1851                         }
1852                 }
1853                 rcu_read_unlock();
1854                 return -EINVAL;
1855         }
1856         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
1857         rcu_read_unlock();
1858         return res;
1859 }
1860 EXPORT_SYMBOL(ip_route_input_noref);
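/* Editor's note -- hedged usage sketch: the main caller is the input path
 * (ip_rcv_finish() in ip_input.c), which resolves a route for each packet
 * arriving without a cached dst; roughly:
 *
 *	const struct iphdr *iph = ip_hdr(skb);
 *	int err;
 *
 *	err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
 *				   iph->tos, skb->dev);
 *	if (unlikely(err))
 *		goto drop;	// e.g. martian source => -EINVAL
 *
 * "noref" refers to skb_dst_set_noref(): on the cached path the skb
 * borrows the per-nexthop dst without taking a reference, which is why
 * the whole lookup must run under rcu_read_lock().
 */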
1861 
1862 /* called with rcu_read_lock() */
1863 static struct rtable *__mkroute_output(const struct fib_result *res,
1864                                        const struct flowi4 *fl4, int orig_oif,
1865                                        struct net_device *dev_out,
1866                                        unsigned int flags)
1867 {
1868         struct fib_info *fi = res->fi;
1869         struct fib_nh_exception *fnhe;
1870         struct in_device *in_dev;
1871         u16 type = res->type;
1872         struct rtable *rth;
1873         bool do_cache;
1874 
1875         in_dev = __in_dev_get_rcu(dev_out);
1876         if (!in_dev)
1877                 return ERR_PTR(-EINVAL);
1878 
1879         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1880                 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
1881                         return ERR_PTR(-EINVAL);
1882 
1883         if (ipv4_is_lbcast(fl4->daddr))
1884                 type = RTN_BROADCAST;
1885         else if (ipv4_is_multicast(fl4->daddr))
1886                 type = RTN_MULTICAST;
1887         else if (ipv4_is_zeronet(fl4->daddr))
1888                 return ERR_PTR(-EINVAL);
1889 
1890         if (dev_out->flags & IFF_LOOPBACK)
1891                 flags |= RTCF_LOCAL;
1892 
1893         do_cache = true;
1894         if (type == RTN_BROADCAST) {
1895                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
1896                 fi = NULL;
1897         } else if (type == RTN_MULTICAST) {
1898                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
1899                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
1900                                      fl4->flowi4_proto))
1901                         flags &= ~RTCF_LOCAL;
1902                 else
1903                         do_cache = false;
1904                 /* If no multicast route exists, use the
1905                  * default one, but do not use a gateway in this case.
1906                  * Yes, it is a hack.
1907                  */
1908                 if (fi && res->prefixlen < 4)
1909                         fi = NULL;
1910         }
1911 
1912         fnhe = NULL;
1913         do_cache &= fi != NULL;
1914         if (do_cache) {
1915                 struct rtable __rcu **prth;
1916                 struct fib_nh *nh = &FIB_RES_NH(*res);
1917 
1918                 fnhe = find_exception(nh, fl4->daddr);
1919                 if (fnhe)
1920                         prth = &fnhe->fnhe_rth_output;
1921                 else {
1922                         if (unlikely(fl4->flowi4_flags &
1923                                      FLOWI_FLAG_KNOWN_NH &&
1924                                      !(nh->nh_gw &&
1925                                        nh->nh_scope == RT_SCOPE_LINK))) {
1926                                 do_cache = false;
1927                                 goto add;
1928                         }
1929                         prth = __this_cpu_ptr(nh->nh_pcpu_rth_output);
1930                 }
1931                 rth = rcu_dereference(*prth);
1932                 if (rt_cache_valid(rth)) {
1933                         dst_hold(&rth->dst);
1934                         return rth;
1935                 }
1936         }
1937 
1938 add:
1939         rth = rt_dst_alloc(dev_out,
1940                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1941                            IN_DEV_CONF_GET(in_dev, NOXFRM),
1942                            do_cache);
1943         if (!rth)
1944                 return ERR_PTR(-ENOBUFS);
1945 
1946         rth->dst.output = ip_output;
1947 
1948         rth->rt_genid = rt_genid_ipv4(dev_net(dev_out));
1949         rth->rt_flags   = flags;
1950         rth->rt_type    = type;
1951         rth->rt_is_input = 0;
1952         rth->rt_iif     = orig_oif ? : 0;
1953         rth->rt_pmtu    = 0;
1954         rth->rt_gateway = 0;
1955         rth->rt_uses_gateway = 0;
1956         INIT_LIST_HEAD(&rth->rt_uncached);
1957 
1958         RT_CACHE_STAT_INC(out_slow_tot);
1959 
1960         if (flags & RTCF_LOCAL)
1961                 rth->dst.input = ip_local_deliver;
1962         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
1963                 if (flags & RTCF_LOCAL &&
1964                     !(dev_out->flags & IFF_LOOPBACK)) {
1965                         rth->dst.output = ip_mc_output;
1966                         RT_CACHE_STAT_INC(out_slow_mc);
1967                 }
1968 #ifdef CONFIG_IP_MROUTE
1969                 if (type == RTN_MULTICAST) {
1970                         if (IN_DEV_MFORWARD(in_dev) &&
1971                             !ipv4_is_local_multicast(fl4->daddr)) {
1972                                 rth->dst.input = ip_mr_input;
1973                                 rth->dst.output = ip_mc_output;
1974                         }
1975                 }
1976 #endif
1977         }
1978 
1979         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
1980 
1981         return rth;
1982 }
1983 
1984 /*
1985  * Major route resolver routine.
1986  */
1987 
1988 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1989 {
1990         struct net_device *dev_out = NULL;
1991         __u8 tos = RT_FL_TOS(fl4);
1992         unsigned int flags = 0;
1993         struct fib_result res;
1994         struct rtable *rth;
1995         int orig_oif;
1996 
1997         res.tclassid    = 0;
1998         res.fi          = NULL;
1999         res.table       = NULL;
2000 
2001         orig_oif = fl4->flowi4_oif;
2002 
2003         fl4->flowi4_iif = LOOPBACK_IFINDEX;
2004         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2005         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2006                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2007 
2008         rcu_read_lock();
2009         if (fl4->saddr) {
2010                 rth = ERR_PTR(-EINVAL);
2011                 if (ipv4_is_multicast(fl4->saddr) ||
2012                     ipv4_is_lbcast(fl4->saddr) ||
2013                     ipv4_is_zeronet(fl4->saddr))
2014                         goto out;
2015 
2016                 /* I removed the check for oif == dev_out->oif here.
2017                    It was wrong for two reasons:
2018                    1. ip_dev_find(net, saddr) can return the wrong iface, if saddr
2019                       is assigned to multiple interfaces.
2020                    2. Moreover, we are allowed to send packets with the saddr
2021                       of another iface. --ANK
2022                  */
2023 
2024                 if (fl4->flowi4_oif == 0 &&
2025                     (ipv4_is_multicast(fl4->daddr) ||
2026                      ipv4_is_lbcast(fl4->daddr))) {
2027                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2028                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2029                         if (dev_out == NULL)
2030                                 goto out;
2031 
2032                         /* Special hack: the user can direct multicasts
2033                            and limited broadcast via the necessary interface
2034                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2035                            This hack is not just for fun; it allows
2036                            vic, vat and friends to work.
2037                            They bind a socket to loopback, set the ttl to zero
2038                            and expect that it will work.
2039                            From the viewpoint of the routing cache they are broken,
2040                            because we are not allowed to build a multicast path
2041                            with a loopback source addr (the routing cache
2042                            cannot know that the ttl is zero, so that the packet
2043                            will never leave this host and the route is valid).
2044                            Luckily, this hack is a good workaround.
2045                          */
2046 
2047                         fl4->flowi4_oif = dev_out->ifindex;
2048                         goto make_route;
2049                 }
2050 
2051                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2052                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2053                         if (!__ip_dev_find(net, fl4->saddr, false))
2054                                 goto out;
2055                 }
2056         }
2057 
2058 
2059         if (fl4->flowi4_oif) {
2060                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2061                 rth = ERR_PTR(-ENODEV);
2062                 if (dev_out == NULL)
2063                         goto out;
2064 
2065                 /* RACE: Check return value of inet_select_addr instead. */
2066                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2067                         rth = ERR_PTR(-ENETUNREACH);
2068                         goto out;
2069                 }
2070                 if (ipv4_is_local_multicast(fl4->daddr) ||
2071                     ipv4_is_lbcast(fl4->daddr)) {
2072                         if (!fl4->saddr)
2073                                 fl4->saddr = inet_select_addr(dev_out, 0,
2074                                                               RT_SCOPE_LINK);
2075                         goto make_route;
2076                 }
2077                 if (!fl4->saddr) {
2078                         if (ipv4_is_multicast(fl4->daddr))
2079                                 fl4->saddr = inet_select_addr(dev_out, 0,
2080                                                               fl4->flowi4_scope);
2081                         else if (!fl4->daddr)
2082                                 fl4->saddr = inet_select_addr(dev_out, 0,
2083                                                               RT_SCOPE_HOST);
2084                 }
2085         }
2086 
2087         if (!fl4->daddr) {
2088                 fl4->daddr = fl4->saddr;
2089                 if (!fl4->daddr)
2090                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2091                 dev_out = net->loopback_dev;
2092                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2093                 res.type = RTN_LOCAL;
2094                 flags |= RTCF_LOCAL;
2095                 goto make_route;
2096         }
2097 
2098         if (fib_lookup(net, fl4, &res)) {
2099                 res.fi = NULL;
2100                 res.table = NULL;
2101                 if (fl4->flowi4_oif) {
2102                         /* Apparently, the routing tables are wrong. Assume
2103                            that the destination is on-link.
2104 
2105                            WHY? DW.
2106                            Because we are allowed to send to an iface
2107                            even if it has NO routes and NO assigned
2108                            addresses. When oif is specified, the routing
2109                            tables are looked up with only one purpose:
2110                            to catch whether the destination is gatewayed, rather
2111                            than direct. Moreover, if MSG_DONTROUTE is set,
2112                            we send the packet, ignoring both the routing tables
2113                            and ifaddr state. --ANK
2114 
2115 
2116                            We could do the same even when oif is unknown
2117                            (IPv6 likely does), but we do not.
2118                          */
2119 
2120                         if (fl4->saddr == 0)
2121                                 fl4->saddr = inet_select_addr(dev_out, 0,
2122                                                               RT_SCOPE_LINK);
2123                         res.type = RTN_UNICAST;
2124                         goto make_route;
2125                 }
2126                 rth = ERR_PTR(-ENETUNREACH);
2127                 goto out;
2128         }
2129 
2130         if (res.type == RTN_LOCAL) {
2131                 if (!fl4->saddr) {
2132                         if (res.fi->fib_prefsrc)
2133                                 fl4->saddr = res.fi->fib_prefsrc;
2134                         else
2135                                 fl4->saddr = fl4->daddr;
2136                 }
2137                 dev_out = net->loopback_dev;
2138                 fl4->flowi4_oif = dev_out->ifindex;
2139                 flags |= RTCF_LOCAL;
2140                 goto make_route;
2141         }
2142 
2143 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2144         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2145                 fib_select_multipath(&res);
2146         else
2147 #endif
2148         if (!res.prefixlen &&
2149             res.table->tb_num_default > 1 &&
2150             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2151                 fib_select_default(&res);
2152 
2153         if (!fl4->saddr)
2154                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2155 
2156         dev_out = FIB_RES_DEV(res);
2157         fl4->flowi4_oif = dev_out->ifindex;
2158 
2159 
2160 make_route:
2161         rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2162 
2163 out:
2164         rcu_read_unlock();
2165         return rth;
2166 }
2167 EXPORT_SYMBOL_GPL(__ip_route_output_key);
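/* Editor's note: the resolver above proceeds in a fixed order -- validate
 * a caller-supplied saddr (reject martians; confirm it is local unless
 * FLOWI_FLAG_ANYSRC is set), resolve an explicit oif, fall back to the
 * loopback device when daddr is empty, then fib_lookup(); on lookup
 * failure with an oif it assumes an on-link destination, and finally
 * __mkroute_output() builds or reuses the rtable.
 */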
2168 
2169 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2170 {
2171         return NULL;
2172 }
2173 
2174 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2175 {
2176         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2177 
2178         return mtu ? : dst->dev->mtu;
2179 }
2180 
2181 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2182                                           struct sk_buff *skb, u32 mtu)
2183 {
2184 }
2185 
2186 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2187                                        struct sk_buff *skb)
2188 {
2189 }
2190 
2191 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2192                                           unsigned long old)
2193 {
2194         return NULL;
2195 }
2196 
2197 static struct dst_ops ipv4_dst_blackhole_ops = {
2198         .family                 =       AF_INET,
2199         .protocol               =       cpu_to_be16(ETH_P_IP),
2200         .check                  =       ipv4_blackhole_dst_check,
2201         .mtu                    =       ipv4_blackhole_mtu,
2202         .default_advmss         =       ipv4_default_advmss,
2203         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2204         .redirect               =       ipv4_rt_blackhole_redirect,
2205         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2206         .neigh_lookup           =       ipv4_neigh_lookup,
2207 };
2208 
2209 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2210 {
2211         struct rtable *ort = (struct rtable *) dst_orig;
2212         struct rtable *rt;
2213 
2214         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2215         if (rt) {
2216                 struct dst_entry *new = &rt->dst;
2217 
2218                 new->__use = 1;
2219                 new->input = dst_discard;
2220                 new->output = dst_discard_sk;
2221 
2222                 new->dev = ort->dst.dev;
2223                 if (new->dev)
2224                         dev_hold(new->dev);
2225 
2226                 rt->rt_is_input = ort->rt_is_input;
2227                 rt->rt_iif = ort->rt_iif;
2228                 rt->rt_pmtu = ort->rt_pmtu;
2229 
2230                 rt->rt_genid = rt_genid_ipv4(net);
2231                 rt->rt_flags = ort->rt_flags;
2232                 rt->rt_type = ort->rt_type;
2233                 rt->rt_gateway = ort->rt_gateway;
2234                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2235 
2236                 INIT_LIST_HEAD(&rt->rt_uncached);
2237 
2238                 dst_free(new);
2239         }
2240 
2241         dst_release(dst_orig);
2242 
2243         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2244 }
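/* Editor's note (hedged): the blackhole dst above is used by the xfrm
 * layer -- when an IPsec policy matches but the SAs are not yet resolved,
 * xfrm_lookup() can hand back this route so the caller keeps a valid but
 * packet-discarding dst (input/output are dst_discard*) instead of
 * failing outright.
 */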
2245 
2246 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2247                                     struct sock *sk)
2248 {
2249         struct rtable *rt = __ip_route_output_key(net, flp4);
2250 
2251         if (IS_ERR(rt))
2252                 return rt;
2253 
2254         if (flp4->flowi4_proto)
2255                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2256                                                    flowi4_to_flowi(flp4),
2257                                                    sk, 0);
2258 
2259         return rt;
2260 }
2261 EXPORT_SYMBOL_GPL(ip_route_output_flow);
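/* Editor's note -- hedged caller sketch (names are illustrative): a
 * protocol such as UDP typically fills a flow key and lets this routine
 * run the resolver plus the xfrm lookup:
 *
 *	struct flowi4 fl4 = {
 *		.flowi4_oif   = sk->sk_bound_dev_if,
 *		.flowi4_mark  = sk->sk_mark,
 *		.flowi4_tos   = RT_TOS(inet_sk(sk)->tos),
 *		.flowi4_proto = IPPROTO_UDP,
 *		.daddr        = daddr,
 *		.saddr        = inet_sk(sk)->inet_saddr,
 *	};
 *	struct rtable *rt = ip_route_output_flow(net, &fl4, sk);
 *
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	// on success, fl4.saddr/fl4.daddr hold the addresses actually used
 */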
2262 
2263 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
2264                         struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2265                         u32 seq, int event, int nowait, unsigned int flags)
2266 {
2267         struct rtable *rt = skb_rtable(skb);
2268         struct rtmsg *r;
2269         struct nlmsghdr *nlh;
2270         unsigned long expires = 0;
2271         u32 error;
2272         u32 metrics[RTAX_MAX];
2273 
2274         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
2275         if (nlh == NULL)
2276                 return -EMSGSIZE;
2277 
2278         r = nlmsg_data(nlh);
2279         r->rtm_family    = AF_INET;
2280         r->rtm_dst_len  = 32;
2281         r->rtm_src_len  = 0;
2282         r->rtm_tos      = fl4->flowi4_tos;
2283         r->rtm_table    = RT_TABLE_MAIN;
2284         if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2285                 goto nla_put_failure;
2286         r->rtm_type     = rt->rt_type;
2287         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2288         r->rtm_protocol = RTPROT_UNSPEC;
2289         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2290         if (rt->rt_flags & RTCF_NOTIFY)
2291                 r->rtm_flags |= RTM_F_NOTIFY;
2292 
2293         if (nla_put_be32(skb, RTA_DST, dst))
2294                 goto nla_put_failure;
2295         if (src) {
2296                 r->rtm_src_len = 32;
2297                 if (nla_put_be32(skb, RTA_SRC, src))
2298                         goto nla_put_failure;
2299         }
2300         if (rt->dst.dev &&
2301             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2302                 goto nla_put_failure;
2303 #ifdef CONFIG_IP_ROUTE_CLASSID
2304         if (rt->dst.tclassid &&
2305             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2306                 goto nla_put_failure;
2307 #endif
2308         if (!rt_is_input_route(rt) &&
2309             fl4->saddr != src) {
2310                 if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
2311                         goto nla_put_failure;
2312         }
2313         if (rt->rt_uses_gateway &&
2314             nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2315                 goto nla_put_failure;
2316 
2317         expires = rt->dst.expires;
2318         if (expires) {
2319                 unsigned long now = jiffies;
2320 
2321                 if (time_before(now, expires))
2322                         expires -= now;
2323                 else
2324                         expires = 0;
2325         }
2326 
2327         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2328         if (rt->rt_pmtu && expires)
2329                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2330         if (rtnetlink_put_metrics(skb, metrics) < 0)
2331                 goto nla_put_failure;
2332 
2333         if (fl4->flowi4_mark &&
2334             nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2335                 goto nla_put_failure;
2336 
2337         error = rt->dst.error;
2338 
2339         if (rt_is_input_route(rt)) {
2340 #ifdef CONFIG_IP_MROUTE
2341                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2342                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2343                         int err = ipmr_get_route(net, skb,
2344                                                  fl4->saddr, fl4->daddr,
2345                                                  r, nowait);
2346                         if (err <= 0) {
2347                                 if (!nowait) {
2348                                         if (err == 0)
2349                                                 return 0;
2350                                         goto nla_put_failure;
2351                                 } else {
2352                                         if (err == -EMSGSIZE)
2353                                                 goto nla_put_failure;
2354                                         error = err;
2355                                 }
2356                         }
2357                 } else
2358 #endif
2359                         if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2360                                 goto nla_put_failure;
2361         }
2362 
2363         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2364                 goto nla_put_failure;
2365 
2366         return nlmsg_end(skb, nlh);
2367 
2368 nla_put_failure:
2369         nlmsg_cancel(skb, nlh);
2370         return -EMSGSIZE;
2371 }
2372 
2373 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2374 {
2375         struct net *net = sock_net(in_skb->sk);
2376         struct rtmsg *rtm;
2377         struct nlattr *tb[RTA_MAX+1];
2378         struct rtable *rt = NULL;
2379         struct flowi4 fl4;
2380         __be32 dst = 0;
2381         __be32 src = 0;
2382         u32 iif;
2383         int err;
2384         int mark;
2385         struct sk_buff *skb;
2386 
2387         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2388         if (err < 0)
2389                 goto errout;
2390 
2391         rtm = nlmsg_data(nlh);
2392 
2393         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2394         if (skb == NULL) {
2395                 err = -ENOBUFS;
2396                 goto errout;
2397         }
2398 
2399         /* Reserve room for dummy headers; this skb can pass
2400            through a good chunk of the routing engine.
2401          */
2402         skb_reset_mac_header(skb);
2403         skb_reset_network_header(skb);
2404 
2405         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2406         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2407         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2408 
2409         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2410         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2411         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2412         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2413 
2414         memset(&fl4, 0, sizeof(fl4));
2415         fl4.daddr = dst;
2416         fl4.saddr = src;
2417         fl4.flowi4_tos = rtm->rtm_tos;
2418         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2419         fl4.flowi4_mark = mark;
2420 
2421         if (iif) {
2422                 struct net_device *dev;
2423 
2424                 dev = __dev_get_by_index(net, iif);
2425                 if (dev == NULL) {
2426                         err = -ENODEV;
2427                         goto errout_free;
2428                 }
2429 
2430                 skb->protocol   = htons(ETH_P_IP);
2431                 skb->dev        = dev;
2432                 skb->mark       = mark;
2433                 local_bh_disable();
2434                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2435                 local_bh_enable();
2436 
2437                 rt = skb_rtable(skb);
2438                 if (err == 0 && rt->dst.error)
2439                         err = -rt->dst.error;
2440         } else {
2441                 rt = ip_route_output_key(net, &fl4);
2442 
2443                 err = 0;
2444                 if (IS_ERR(rt))
2445                         err = PTR_ERR(rt);
2446         }
2447 
2448         if (err)
2449                 goto errout_free;
2450 
2451         skb_dst_set(skb, &rt->dst);
2452         if (rtm->rtm_flags & RTM_F_NOTIFY)
2453                 rt->rt_flags |= RTCF_NOTIFY;
2454 
2455         err = rt_fill_info(net, dst, src, &fl4, skb,
2456                            NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2457                            RTM_NEWROUTE, 0, 0);
2458         if (err <= 0)
2459                 goto errout_free;
2460 
2461         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2462 errout:
2463         return err;
2464 
2465 errout_free:
2466         kfree_skb(skb);
2467         goto errout;
2468 }
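/* Editor's note -- hedged userspace sketch of the RTM_GETROUTE request the
 * handler above serves (roughly what `ip route get 198.51.100.1` sends on
 * a NETLINK_ROUTE socket; attribute packing is abbreviated):
 *
 *	struct {
 *		struct nlmsghdr n;
 *		struct rtmsg    r;
 *		char            buf[64];
 *	} req = {
 *		.n.nlmsg_len   = NLMSG_LENGTH(sizeof(struct rtmsg)),
 *		.n.nlmsg_type  = RTM_GETROUTE,
 *		.n.nlmsg_flags = NLM_F_REQUEST,
 *		.r.rtm_family  = AF_INET,
 *	};
 *	// append an RTA_DST attribute with the 4-byte address (and
 *	// optionally RTA_IIF/RTA_OIF/RTA_MARK), send(), then parse the
 *	// RTM_NEWROUTE reply produced by rt_fill_info().
 */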
2469 
2470 void ip_rt_multicast_event(struct in_device *in_dev)
2471 {
2472         rt_cache_flush(dev_net(in_dev->dev));
2473 }
2474 
2475 #ifdef CONFIG_SYSCTL
2476 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
2477 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2478 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
2479 static int ip_rt_gc_elasticity __read_mostly    = 8;
2480 
2481 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2482                                         void __user *buffer,
2483                                         size_t *lenp, loff_t *ppos)
2484 {
2485         struct net *net = (struct net *)__ctl->extra1;
2486 
2487         if (write) {
2488                 rt_cache_flush(net);
2489                 fnhe_genid_bump(net);
2490                 return 0;
2491         }
2492 
2493         return -EINVAL;
2494 }
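/* Editor's note: the handler above is write-only -- writing any value to
 * /proc/sys/net/ipv4/route/flush (e.g. "echo 1 > /proc/sys/net/ipv4/route/flush")
 * bumps both the route and fnhe generation ids, lazily invalidating every
 * cached dst and every cached nexthop exception; reading the file returns
 * -EINVAL.
 */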
2495 
2496 static struct ctl_table ipv4_route_table[] = {
2497         {
2498                 .procname       = "gc_thresh",
2499                 .data           = &ipv4_dst_ops.gc_thresh,
2500                 .maxlen         = sizeof(int),
2501                 .mode           = 0644,
2502                 .proc_handler   = proc_dointvec,
2503         },
2504         {
2505                 .procname       = "max_size",
2506                 .data           = &ip_rt_max_size,
2507                 .maxlen         = sizeof(int),
2508                 .mode           = 0644,
2509                 .proc_handler   = proc_dointvec,
2510         },
2511         {
2512                 /*  Deprecated. Use gc_min_interval_ms */
2513 
2514                 .procname       = "gc_min_interval",
2515                 .data           = &ip_rt_gc_min_interval,
2516                 .maxlen         = sizeof(int),
2517                 .mode           = 0644,
2518                 .proc_handler   = proc_dointvec_jiffies,
2519         },
2520         {
2521                 .procname       = "gc_min_interval_ms",
2522                 .data           = &ip_rt_gc_min_interval,
2523                 .maxlen         = sizeof(int),
2524                 .mode           = 0644,
2525                 .proc_handler   = proc_dointvec_ms_jiffies,
2526         },
2527         {
2528                 .procname       = "gc_timeout",
2529                 .data           = &ip_rt_gc_timeout,
2530                 .maxlen         = sizeof(int),
2531                 .mode           = 0644,
2532                 .proc_handler   = proc_dointvec_jiffies,
2533         },
2534         {
2535                 .procname       = "gc_interval",
2536                 .data           = &ip_rt_gc_interval,
2537                 .maxlen         = sizeof(int),
2538                 .mode           = 0644,
2539                 .proc_handler   = proc_dointvec_jiffies,
2540         },
2541         {
2542                 .procname       = "redirect_load",
2543                 .data           = &ip_rt_redirect_load,
2544                 .maxlen         = sizeof(int),
2545                 .mode           = 0644,
2546                 .proc_handler   = proc_dointvec,
2547         },
2548         {
2549                 .procname       = "redirect_number",
2550                 .data           = &ip_rt_redirect_number,
2551                 .maxlen         = sizeof(int),
2552                 .mode           = 0644,
2553                 .proc_handler   = proc_dointvec,
2554         },
2555         {
2556                 .procname       = "redirect_silence",
2557                 .data           = &ip_rt_redirect_silence,
2558                 .maxlen         = sizeof(int),
2559                 .mode           = 0644,
2560                 .proc_handler   = proc_dointvec,
2561         },
2562         {
2563                 .procname       = "error_cost",
2564                 .data           = &ip_rt_error_cost,
2565                 .maxlen         = sizeof(int),
2566                 .mode           = 0644,
2567                 .proc_handler   = proc_dointvec,
2568         },
2569         {
2570                 .procname       = "error_burst",
2571                 .data           = &ip_rt_error_burst,
2572                 .maxlen         = sizeof(int),
2573                 .mode           = 0644,
2574                 .proc_handler   = proc_dointvec,
2575         },
2576         {
2577                 .procname       = "gc_elasticity",
2578                 .data           = &ip_rt_gc_elasticity,
2579                 .maxlen         = sizeof(int),
2580                 .mode           = 0644,
2581                 .proc_handler   = proc_dointvec,
2582         },
2583         {
2584                 .procname       = "mtu_expires",
2585                 .data           = &ip_rt_mtu_expires,
2586                 .maxlen         = sizeof(int),
2587                 .mode           = 0644,
2588                 .proc_handler   = proc_dointvec_jiffies,
2589         },
2590         {
2591                 .procname       = "min_pmtu",
2592                 .data           = &ip_rt_min_pmtu,
2593                 .maxlen         = sizeof(int),
2594                 .mode           = 0644,
2595                 .proc_handler   = proc_dointvec,
2596         },
2597         {
2598                 .procname       = "min_adv_mss",
2599                 .data           = &ip_rt_min_advmss,
2600                 .maxlen         = sizeof(int),
2601                 .mode           = 0644,
2602                 .proc_handler   = proc_dointvec,
2603         },
2604         { }
2605 };
2606 
2607 static struct ctl_table ipv4_route_flush_table[] = {
2608         {
2609                 .procname       = "flush",
2610                 .maxlen         = sizeof(int),
2611                 .mode           = 0200,
2612                 .proc_handler   = ipv4_sysctl_rtcache_flush,
2613         },
2614         { },
2615 };
2616 
2617 static __net_init int sysctl_route_net_init(struct net *net)
2618 {
2619         struct ctl_table *tbl;
2620 
2621         tbl = ipv4_route_flush_table;
2622         if (!net_eq(net, &init_net)) {
2623                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2624                 if (tbl == NULL)
2625                         goto err_dup;
2626 
2627                 /* Don't export sysctls to unprivileged users */
2628                 if (net->user_ns != &init_user_ns)
2629                         tbl[0].procname = NULL;
2630         }
2631         tbl[0].extra1 = net;
2632 
2633         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2634         if (net->ipv4.route_hdr == NULL)
2635                 goto err_reg;
2636         return 0;
2637 
2638 err_reg:
2639         if (tbl != ipv4_route_flush_table)
2640                 kfree(tbl);
2641 err_dup:
2642         return -ENOMEM;
2643 }
2644 
2645 static __net_exit void sysctl_route_net_exit(struct net *net)
2646 {
2647         struct ctl_table *tbl;
2648 
2649         tbl = net->ipv4.route_hdr->ctl_table_arg;
2650         unregister_net_sysctl_table(net->ipv4.route_hdr);
2651         BUG_ON(tbl == ipv4_route_flush_table);
2652         kfree(tbl);
2653 }
2654 
2655 static __net_initdata struct pernet_operations sysctl_route_ops = {
2656         .init = sysctl_route_net_init,
2657         .exit = sysctl_route_net_exit,
2658 };
2659 #endif
2660 
2661 static __net_init int rt_genid_init(struct net *net)
2662 {
2663         atomic_set(&net->ipv4.rt_genid, 0);
2664         atomic_set(&net->fnhe_genid, 0);
2665         get_random_bytes(&net->ipv4.dev_addr_genid,
2666                          sizeof(net->ipv4.dev_addr_genid));
2667         return 0;
2668 }
2669 
2670 static __net_initdata struct pernet_operations rt_genid_ops = {
2671         .init = rt_genid_init,
2672 };
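/* Editor's note: rt_genid is the lazy invalidation mechanism referenced
 * throughout this file -- every rtable records the generation id current
 * at allocation (rt_genid_ipv4()), and rt_is_expired()/rt_cache_valid()
 * simply compare it against the per-netns counter, so a flush is O(1)
 * and stale entries die off on their next lookup.
 */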
2673 
2674 static int __net_init ipv4_inetpeer_init(struct net *net)
2675 {
2676         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2677 
2678         if (!bp)
2679                 return -ENOMEM;
2680         inet_peer_base_init(bp);
2681         net->ipv4.peers = bp;
2682         return 0;
2683 }
2684 
2685 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2686 {
2687         struct inet_peer_base *bp = net->ipv4.peers;
2688 
2689         net->ipv4.peers = NULL;
2690         inetpeer_invalidate_tree(bp);
2691         kfree(bp);
2692 }
2693 
2694 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2695         .init   =       ipv4_inetpeer_init,
2696         .exit   =       ipv4_inetpeer_exit,
2697 };
2698 
2699 #ifdef CONFIG_IP_ROUTE_CLASSID
2700 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
2701 #endif /* CONFIG_IP_ROUTE_CLASSID */
2702 
2703 int __init ip_rt_init(void)
2704 {
2705         int rc = 0;
2706 
2707 #ifdef CONFIG_IP_ROUTE_CLASSID
2708         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2709         if (!ip_rt_acct)
2710                 panic("IP: failed to allocate ip_rt_acct\n");
2711 #endif
2712 
2713         ipv4_dst_ops.kmem_cachep =
2714                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2715                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2716 
2717         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2718 
2719         if (dst_entries_init(&ipv4_dst_ops) < 0)
2720                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
2721 
2722         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2723                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2724 
2725         ipv4_dst_ops.gc_thresh = ~0;
2726         ip_rt_max_size = INT_MAX;
2727 
2728         devinet_init();
2729         ip_fib_init();
2730 
2731         if (ip_rt_proc_init())
2732                 pr_err("Unable to create route proc files\n");
2733 #ifdef CONFIG_XFRM
2734         xfrm_init();
2735         xfrm4_init();
2736 #endif
2737         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2738 
2739 #ifdef CONFIG_SYSCTL
2740         register_pernet_subsys(&sysctl_route_ops);
2741 #endif
2742         register_pernet_subsys(&rt_genid_ops);
2743         register_pernet_subsys(&ipv4_inetpeer_ops);
2744         return rc;
2745 }
2746 
2747 #ifdef CONFIG_SYSCTL
2748 /*
2749  * We really need to sanitize the damn ipv4 init order, then all
2750  * this nonsense will go away.
2751  */
2752 void __init ip_static_sysctl_init(void)
2753 {
2754         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
2755 }
2756 #endif
2757 
