kernel_init {
    // 初始化各种 init 函数
    kernel_init_freeable();
        -> do_basic_setup
            -> driver_init();
            -> do_initcalls();
    // do_initcalls() 借助编译器和链接器的配合,找到内核中所有带 __init 标识并通过 initcall 注册的函数,
    // 如 sock_init、inet_init 等(例如 'static int __init sock_init(void)'),并依次调用它们,从而完成各子系统的初始化。
    // 这种机制涉及 ld 链接脚本、gcc 编译器本身的机制以及 ELF 文件结构,内容较多,等有机会再具体写写。
    // 这里截取一段:对 vmlinux 执行 objdump 后能看到的 init 函数符号(符号名末尾的数字是 initcall 级别):
    ffffffff820a48e8 l O .init.data 0000000000000008 __initcall_sock_init1                      关键函数1
    ffffffff820a4e18 l O .init.data 0000000000000008 __initcall_proto_init4
    ffffffff820a48f0 l O .init.data 0000000000000008 __initcall_net_inuse_init1
    ffffffff820a4788 l O .init.data 0000000000000008 __initcall_net_ns_init0
    ffffffff820a48f8 l O .init.data 0000000000000008 __initcall_net_defaults_init1
    ffffffff820a4900 l O .init.data 0000000000000008 __initcall_init_default_flow_dissectors1
    ffffffff820a5020 l O .init.data 0000000000000008 __initcall_sysctl_core_init5
    ffffffff820a4e20 l O .init.data 0000000000000008 __initcall_net_dev_init4                   关键函数2
    ffffffff820a4e28 l O .init.data 0000000000000008 __initcall_neigh_init4
    ffffffff820a5030 l O .init.data 0000000000000008 __initcall_inet_init5                      关键函数3

    // 拉起 init 进程
    if (execute_command) {
        ret = run_init_process(execute_command);
        if (!ret)
            return 0;
        panic("Requested init %s failed (error %d).", execute_command, ret);
    }
    if (!try_to_run_init_process("/sbin/init") ||
        !try_to_run_init_process("/etc/init") ||
        !try_to_run_init_process("/bin/init") ||
        !try_to_run_init_process("/bin/sh"))
        return 0;
}
staticint __init sock_init(void) { int err; /* * Initialize the network sysctl infrastructure. */ err = net_sysctl_init(); //初始化网络相关的/proc/sys/下的目录: /* /* Avoid limitations in the sysctl implementation by * registering "/proc/sys/net" as an empty directory not in a * network namespace. net_header = register_sysctl("net", empty); */ //初始化网络空间相关操作等,网络空间用于docker时,每个docker容器之间的网络隔离,类似的还有文件系统隔离等带来的文件系统空间等等; //ret = register_pernet_subsys(&sysctl_pernet_ops); //register_sysctl_root(&net_sysctl_root); if (err) goto out;
int socket(int domain, int type, int protocol);
domain: 指定协议族,如 IPv4/IPv6 等。
    AF_INET       IPv4 Internet protocols
type: 指定套接字类型(流式或数据报,分别对应 TCP/UDP)。
    SOCK_STREAM   Provides sequenced, reliable, two-way, connection-based byte streams. An out-of-band data transmission mechanism may be supported.
    SOCK_DGRAM    Supports datagrams (connectionless, unreliable messages of a fixed maximum length).
protocol: 通常在给定协议族中只存在一个协议来支持特定的套接字类型,在这种情况下,protocol 可以指定为 0。
rc = proto_register(&udp_prot, 1); if (rc) goto out_unregister_tcp_proto;
rc = proto_register(&raw_prot, 1); if (rc) goto out_unregister_udp_proto;
rc = proto_register(&ping_prot, 1); if (rc) goto out_unregister_raw_proto;
/* * Tell SOCKET that we are alive... */ (void)sock_register(&inet_family_ops); /* //将inet_family_ops注册到地址簇列表中 (void)sock_register(&inet_family_ops); rcu_assign_pointer(net_families[ops->family], ops); //其实就是把ops放到对应的表net_family中,这个是全局变量,下面会解释 */ #ifdef CONFIG_SYSCTL ip_static_sysctl_init(); #endif
/* * Add all the base protocols. */ //下面填充 inet_protos[protocol]=struct net_protocol结构; if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0) pr_crit("%s: Cannot add ICMP protocol\n", __func__); if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0) pr_crit("%s: Cannot add UDP protocol\n", __func__); if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0) pr_crit("%s: Cannot add TCP protocol\n", __func__); #ifdef CONFIG_IP_MULTICAST if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0) pr_crit("%s: Cannot add IGMP protocol\n", __func__); #endif //初始化和注册 inetsw[] /* Register the socket-side information for inet_create. */ for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r) INIT_LIST_HEAD(r);
for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q) inet_register_protosw(q); /# 这个函数展开看看,主要是将inetws_array中的元素添加到全局静态链表中inetsw,注意linux特殊的链表连接方式,是结构中的成员为一个结点; voidinet_register_protosw(struct inet_protosw *p) { structlist_head *lh; structinet_protosw *answer; int protocol = p->protocol; structlist_head *last_perm; spin_lock_bh(&inetsw_lock); if (p->type >= SOCK_MAX) goto out_illegal; /* If we are trying to override a permanent protocol, bail. */ last_perm = &inetsw[p->type];//取出结构 list_for_each(lh, &inetsw[p->type]) { answer = list_entry(lh, struct inet_protosw, list); /* Check only the non-wild match. */ if ((INET_PROTOSW_PERMANENT & answer->flags) == 0) break; if (protocol == answer->protocol) goto out_permanent; last_perm = lh; } /* Add the new entry after the last permanent entry if any, so that * the new entry does not override a permanent entry when matched with * a wild-card protocol. But it is allowed to override any existing * non-permanent entry. This means that when we remove this entry, the * system automatically returns to the old behavior. */ list_add_rcu(&p->list, last_perm);//将p->list加到取出的结构中 ... } #/ /* * Set the ARP module up */ //初始化arp和arp_packet_type注册:dev_add_pack(&arp_packet_type); arp_init();
/* * Set the IP module up */ //ip路由表初始化 ip_init(); //tcp hashinfo相关初始化 tcp_v4_init();
/* Setup TCP slab cache for open requests. */ tcp_init();
staticstructlist_head inetsw[SOCK_MAX]; /* This is used to register socket interfaces for IP protocols. */ structinet_protosw { structlist_head list;
/* These two fields form the lookup key. */ unsignedshort type; /* This is the 2nd argument to socket(2). */ unsignedshort protocol; /* This is the L4 protocol number. */
/* Upon startup we insert all the elements in inetsw_array[] into * the linked list inetsw. */ staticstructinet_protosw inetsw_array[] = { { .type = SOCK_STREAM, .protocol = IPPROTO_TCP, .prot = &tcp_prot, .ops = &inet_stream_ops, .flags = INET_PROTOSW_PERMANENT | INET_PROTOSW_ICSK, },
/* * This is called single threaded during boot, so no need * to take the rtnl semaphore. */ staticint __init net_dev_init(void) { int i, rc = -ENOMEM;
BUG_ON(!dev_boot_phase);
if (dev_proc_init())//在/proc/net目录下创建四个proc条目(分别为dev、softnet_stat、ptype和wireless) goto out;
if (netdev_kobject_init())//暂时不太清除 goto out;
INIT_LIST_HEAD(&ptype_all); for (i = 0; i < PTYPE_HASH_SIZE; i++) INIT_LIST_HEAD(&ptype_base[i]);//前面有提到,不赘述
INIT_LIST_HEAD(&offload_base);() if (register_pernet_subsys(&netdev_net_ops))//将全局变量netdev_net_ops注册到链表(static struct list_head *first_device = &pernet_list;)上 goto out;
/* The loopback device is special if any other network devices * is present in a network namespace the loopback device must * be present. Since we now dynamically allocate and free the * loopback device ensure this invariant is maintained by * keeping the loopback device as the first device on the * list of network devices. Ensuring the loopback devices * is the first device that appears and the last network device * that disappears. */ //注册网络命令空间设备,确保loopback设备在所有网络设备中最先出现和最后消失 if (register_pernet_device(&loopback_net_ops)) goto out;
think@think-VirtualBox:~/source_linux/linux-lts-xenial-4.4.0$ objdump -t vmlinux| grep loopback 0000000000000000 l df *ABS* 0000000000000000 loopback.c ffffffff815ec3a0 l F .text 00000000000000a8 loopback_setup ffffffff81aa91c0 l O .rodata 0000000000000198 loopback_ethtool_ops ffffffff81aa8f80 l O .rodata 0000000000000238 loopback_ops ffffffff815ec450 l F .text 0000000000000036 loopback_dev_free ffffffff815ec490 l F .text 0000000000000081 loopback_get_stats64 ffffffff815ec520 l F .text 000000000000009e loopback_xmit ffffffff815ec5c0 l F .text 000000000000007d loopback_dev_init ffffffff815ec640 l F .text 000000000000009d loopback_net_init //这个函数
/* * snull.c -- the Simple Network Utility * * Copyright (C) 2001 Alessandro Rubini and Jonathan Corbet * Copyright (C) 2001 O'Reilly & Associates * * The source code in this file can be freely used, adapted, * and redistributed in source or binary form, so long as an * acknowledgment appears in derived source files. The citation * should list that the code comes from the book "Linux Device * Drivers" by Alessandro Rubini and Jonathan Corbet, published * by O'Reilly & Associates. No warranty is attached; * we cannot take responsibility for errors or fitness for use. * * $Id: snull.c,v 1.21 2004/11/05 02:36:03 rubini Exp $ */
/*
 * Do we run in NAPI mode?
 * Defaults to 0 and can be overridden at load time through the
 * "use_napi" module parameter.
 */
static int use_napi = 0;
module_param(use_napi, int, 0);
/* * A structure representing an in-flight packet. */ structsnull_packet { structsnull_packet *next; structnet_device *dev; int datalen; u8 data[ETH_DATA_LEN]; };
/*
 * Number of packet buffers snull_setup_pool() tries to allocate for a
 * device. Defaults to 8; settable via the "pool_size" module parameter.
 */
int pool_size = 8;
module_param(pool_size, int, 0);
/* * This structure is private to each device. It is used to pass * packets in and out, so there is place for a packet */
structsnull_priv {//这个网络设备的私有数据结构 structnet_device_stats stats; int status; structsnull_packet *ppool; structsnull_packet *rx_queue; /* List of incoming packets */ int rx_int_enabled; int tx_packetlen; u8 *tx_packetdata; structsk_buff *skb; spinlock_t lock; }; //一些net_deivce需要的函数 staticvoidsnull_tx_timeout(struct net_device *dev); staticvoid(*snull_interrupt)(int, void *, struct pt_regs *);
/* * Set up a device's packet pool. */ voidsnull_setup_pool(struct net_device *dev) { structsnull_priv *priv = netdev_priv(dev); int i; structsnull_packet *pkt;
priv->ppool = NULL; for (i = 0; i < pool_size; i++) { pkt = kmalloc (sizeof (struct snull_packet), GFP_KERNEL); if (pkt == NULL) { printk (KERN_NOTICE "Ran out of memory allocating packet pool\n"); return; } pkt->dev = dev; pkt->next = priv->ppool; priv->ppool = pkt; } }
/* * Assign the hardware address of the board: use "\0SNULx", where * x is 0 or 1. The first byte is '\0' to avoid being a multicast * address (the first byte of multicast addrs is odd). */ memcpy(dev->dev_addr, "\0SNUL0", ETH_ALEN); if (dev == snull_devs[1]) dev->dev_addr[ETH_ALEN-1]++; /* \0SNUL1 */ netif_start_queue(dev); return0; }
/*
 * Close the interface: stops the transmit queue. Always returns 0.
 */
int snull_release(struct net_device *dev)
{
	/* release ports, irq and such -- like fops->close */

	netif_stop_queue(dev); /* can't transmit any more */
	return 0;
}
/*
 * Configuration changes (passed on by ifconfig).
 *
 * Only the IRQ number can be changed, and only while the interface is
 * down. Returns -EBUSY if the interface is up, 0 otherwise.
 */
int snull_config(struct net_device *dev, struct ifmap *map)
{
	if (dev->flags & IFF_UP) /* can't act on a running interface */
		return -EBUSY;

	/* Allow changing the IRQ */
	if (map->irq != dev->irq) {
		dev->irq = map->irq;
		/* request_irq() is delayed to open-time */
	}

	/* ignore other fields */
	return 0;
}
/* * Receive a packet: retrieve, encapsulate and pass over to upper levels */ voidsnull_rx(struct net_device *dev, struct snull_packet *pkt) { structsk_buff *skb; structsnull_priv *priv = netdev_priv(dev);
/* * The packet has been retrieved from the transmission * medium. Build an skb around it, so upper layers can handle it */ skb = dev_alloc_skb(pkt->datalen + 2); if (!skb) { if (printk_ratelimit()) printk(KERN_NOTICE "snull rx: low on mem - packet dropped\n"); priv->stats.rx_dropped++; goto out; } skb_reserve(skb, 2); /* align IP on 16B boundary */ memcpy(skb_put(skb, pkt->datalen), pkt->data, pkt->datalen);
/* Write metadata, and then pass to the receive level */ skb->dev = dev; skb->protocol = eth_type_trans(skb, dev); skb->ip_summed = CHECKSUM_UNNECESSARY; /* don't check it */ priv->stats.rx_packets++; priv->stats.rx_bytes += pkt->datalen; netif_rx(skb); out: return; }
/* * The poll implementation. */ staticintsnull_poll(struct net_device *dev, int *budget) { int npackets = 0, quota = min(dev->quota, *budget); structsk_buff *skb; structsnull_priv *priv = netdev_priv(dev); structsnull_packet *pkt; while (npackets < quota && priv->rx_queue) { pkt = snull_dequeue_buf(dev); skb = dev_alloc_skb(pkt->datalen + 2); if (! skb) { if (printk_ratelimit()) printk(KERN_NOTICE "snull: packet dropped\n"); priv->stats.rx_dropped++; snull_release_buffer(pkt); continue; } skb_reserve(skb, 2); /* align IP on 16B boundary */ memcpy(skb_put(skb, pkt->datalen), pkt->data, pkt->datalen); skb->dev = dev; skb->protocol = eth_type_trans(skb, dev); skb->ip_summed = CHECKSUM_UNNECESSARY; /* don't check it */ netif_receive_skb(skb); /* Maintain stats */ npackets++; priv->stats.rx_packets++; priv->stats.rx_bytes += pkt->datalen; snull_release_buffer(pkt); } /* If we processed all packets, we're done; tell the kernel and reenable ints */ *budget -= npackets; dev->quota -= npackets; if (! priv->rx_queue) { netif_rx_complete(dev); snull_rx_ints(dev, 1); return0; } /* We couldn't process everything. */ return1; } /* * The typical interrupt entry point */ staticvoidsnull_regular_interrupt(int irq, void *dev_id, struct pt_regs *regs) { int statusword; structsnull_priv *priv; structsnull_packet *pkt = NULL; /* * As usual, check the "device" pointer to be sure it is * really interrupting. * Then assign "struct device *dev" */ structnet_device *dev = (struct net_device *)dev_id; /* ... and check with hw if it's really ours */
/* paranoid */ if (!dev) return;
/* Lock the device */ priv = netdev_priv(dev); spin_lock(&priv->lock);
/* retrieve statusword: real netdevices use I/O instructions */ statusword = priv->status; priv->status = 0; if (statusword & SNULL_RX_INTR) { /* send it to snull_rx for handling */ pkt = priv->rx_queue; if (pkt) { priv->rx_queue = pkt->next; snull_rx(dev, pkt); } } if (statusword & SNULL_TX_INTR) { /* a transmission is over: free the skb */ priv->stats.tx_packets++; priv->stats.tx_bytes += priv->tx_packetlen; dev_kfree_skb(priv->skb); }
/* Unlock the device and we are done */ spin_unlock(&priv->lock); if (pkt) snull_release_buffer(pkt); /* Do this outside the lock! */ return; }
/* * A NAPI interrupt handler. */ staticvoidsnull_napi_interrupt(int irq, void *dev_id, struct pt_regs *regs) { int statusword; structsnull_priv *priv;
/* * As usual, check the "device" pointer for shared handlers. * Then assign "struct device *dev" */ structnet_device *dev = (struct net_device *)dev_id; /* ... and check with hw if it's really ours */
/* paranoid */ if (!dev) return;
/* Lock the device */ priv = netdev_priv(dev); spin_lock(&priv->lock);
/* retrieve statusword: real netdevices use I/O instructions */ statusword = priv->status; priv->status = 0; if (statusword & SNULL_RX_INTR) { snull_rx_ints(dev, 0); /* Disable further interrupts */ netif_rx_schedule(dev); } if (statusword & SNULL_TX_INTR) { /* a transmission is over: free the skb */ priv->stats.tx_packets++; priv->stats.tx_bytes += priv->tx_packetlen; dev_kfree_skb(priv->skb); }
/* Unlock the device and we are done */ spin_unlock(&priv->lock); return; }
/* * Transmit a packet (low level interface) */ staticvoidsnull_hw_tx(char *buf, int len, struct net_device *dev) { /* * This function deals with hw details. This interface loops * back the packet to the other snull interface (if any). * In other words, this function implements the snull behaviour, * while all other procedures are rather device-independent */ structiphdr *ih; structnet_device *dest; structsnull_priv *priv; u32 *saddr, *daddr; structsnull_packet *tx_buffer; /* I am paranoid. Ain't I? */ if (len < sizeof(struct ethhdr) + sizeof(struct iphdr)) { printk("snull: Hmm... packet too short (%i octets)\n", len); return; }
if (0) { /* enable this conditional to look at the data */ int i; PDEBUG("len is %i\n" KERN_DEBUG "data:",len); for (i=14 ; i<len; i++) printk(" %02x",buf[i]&0xff); printk("\n"); } /* * Ethhdr is 14 bytes, but the kernel arranges for iphdr * to be aligned (i.e., ethhdr is unaligned) */ ih = (struct iphdr *)(buf+sizeof(struct ethhdr)); saddr = &ih->saddr; daddr = &ih->daddr;
((u8 *)saddr)[2] ^= 1; /* change the third octet (class C) */ ((u8 *)daddr)[2] ^= 1;
ih->check = 0; /* and rebuild the checksum (ip needs it) */ ih->check = ip_fast_csum((unsignedchar *)ih,ih->ihl);
/* * Ok, now the packet is ready for transmission: first simulate a * receive interrupt on the twin device, then a * transmission-done on the transmitting device */ dest = snull_devs[dev == snull_devs[0] ? 1 : 0]; priv = netdev_priv(dest); tx_buffer = snull_get_tx_buffer(dev); tx_buffer->datalen = len; memcpy(tx_buffer->data, buf, len); snull_enqueue_buf(dest, tx_buffer); if (priv->rx_int_enabled) { priv->status |= SNULL_RX_INTR; snull_interrupt(0, dest, NULL); }
/* * Return statistics to the caller */ structnet_device_stats *snull_stats(struct net_device *dev) { structsnull_priv *priv = netdev_priv(dev); return &priv->stats; }
/* * This function is called to fill up an eth header, since arp is not * available on the interface */ intsnull_rebuild_header(struct sk_buff *skb) { structethhdr *eth = (struct ethhdr *) skb->data; structnet_device *dev = skb->dev; memcpy(eth->h_source, dev->dev_addr, dev->addr_len); memcpy(eth->h_dest, dev->dev_addr, dev->addr_len); eth->h_dest[ETH_ALEN-1] ^= 0x01; /* dest is us xor 1 */ return0; }
eth->h_proto = htons(type); memcpy(eth->h_source, saddr ? saddr : dev->dev_addr, dev->addr_len); memcpy(eth->h_dest, daddr ? daddr : dev->dev_addr, dev->addr_len); eth->h_dest[ETH_ALEN-1] ^= 0x01; /* dest is us xor 1 */ return (dev->hard_header_len); }
/* * The "change_mtu" method is usually not needed. * If you need it, it must be like this. */ intsnull_change_mtu(struct net_device *dev, int new_mtu) { unsignedlong flags; structsnull_priv *priv = netdev_priv(dev); spinlock_t *lock = &priv->lock; /* check ranges */ if ((new_mtu < 68) || (new_mtu > 1500)) return -EINVAL; /* * Do anything you need, and the accept the value */ spin_lock_irqsave(lock, flags); dev->mtu = new_mtu; spin_unlock_irqrestore(lock, flags); return0; /* success */ }
/* * The init function (sometimes called probe). * It is invoked by register_netdev() */ //这个是setup函数,用于初始化net_device部分结构,并作为alloc_netdev的第三个参数传入 voidsnull_init(struct net_device *dev) { structsnull_priv *priv; #if 0 /* * Make the usual checks: check_region(), probe irq, ... -ENODEV * should be returned if no device found. No resource should be * grabbed: this is done on open(). */ #endif
/* * Then, assign other fields in dev, using ether_setup() and some * hand assignments */ ether_setup(dev); /* assign some of the fields */
ret = -ENODEV; for (i = 0; i < 2; i++) if ((result = register_netdev(snull_devs[i])))//注册net_device printk("snull: error %i registering device \"%s\"\n", result, snull_devs[i]->name); else ret = 0; out: if (ret) snull_cleanup(); return ret; }