linux skb发送到设备
即 skb从协议栈出来,到设备驱动程序的过程;
简介:传输过程和接收过程是对称的,也是初始化相关软中断,和对应的net_tx_action:
帧传输数据通路的主要任务:
- 为设备开启和关闭帧的传输(驱动程序中的任务)
- 为设备调度以准备传输;
- 为下一帧调度以准备传输,也就是在设备出口队列中等待的那些帧;
- 传输本身(涉及到io端口操作)
初始化软中断:
NET_TX_SOFTIRQ:
dev.c:net_dev_init:
open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL);
几个结构:
- poll_list对应的是接收的设备列表, output_queue 是传输的设备列表;
- struct Qdisc *q; //设备的出口队列,每个设备都会有一个。只有开启的设备(设置了__LINK_STATE_START标识的设备)才能接受调度以准备接收;同样,只有开启传输功能的设备(清除了__LINK_STATE_XOFF标识的设备)才能接受调度并准备传输;
- 当设备接受调度以准备接收时,其__LINK_STATE_RX_SCHED标识会被设置;当设备接受调度以准备传输时,其__LINK_STATE_SCHED标识会被设置;
开始传输
当上层的APP试图建立一个TCP的链接,或者发送一个封包的时候,在kernel的协议栈部分,在TCP/UDP层会组成一个网络的封包,然后通过IP进行路由选择以及iptables的Hook,之后 到neighbor层查询或者询问下一跳的链路层地址,然后通过调用 dev_queue_xmit 这个网络设备接口层函数发送给driver
所以从 dev_queue_xmit 这个函数开始分析起来;
- dev_queue_xmit
从设备的出口队列(Qdisc)中退出一帧,然后将该帧传递给设备的hard_start_xmit方法;
但是这个函数可能因为各种原因无法传输,如该设备的出口队列已关闭,或者该设备队列的锁被取走;为处理后一种情况,内核提供了一个函数:__netif_schedule
下面是某个版本dev_queue_xmit,不同版本大同小异
关键点:
- tc qdisc队列处理,这里根据过滤条件需要,如进行丢包,流量整形等。
- 队列出来后,发送:dev_hard_start_xmit(skb, dev, txq, &ret);
- 判断是否设备发送完成:dev_xmit_complete,是则清理相关缓存,否则,启动__netif_schedule,会
raise_softirq_irqoff(NET_TX_SOFTIRQ); 启动软中断,从而调用到net_tx_action。为什么上述要启动__netif_schedule并启动软中断呢?
/**
* dev_queue_xmit - transmit a buffer
* @skb: buffer to transmit
*
* Queue a buffer for transmission to a network device. The caller must
* have set the device and priority and built the buffer before calling
* this function. The function can be called from an interrupt.
*
* A negative errno code is returned on a failure. A success does not
* guarantee the frame will be transmitted as it may be dropped due
* to congestion or traffic shaping.
*/
int dev_queue_xmit(struct sk_buff *skb)//skb arrives from the upper layer and carries the packet to send
{
struct net_device *dev = skb->dev;//egress device taken from the skb; skb->data points at the start of the payload, skb->len is its length
struct Qdisc *q;//the device's egress queueing discipline
int rc = -ENOMEM;
//If the skb is a list of fragments and the device cannot handle the
//fragments via scatter/gather DMA, collapse it into one linear buffer.
if (skb_shinfo(skb)->frag_list &&
!(dev->features & NETIF_F_FRAGLIST) &&
__skb_linearize(skb, GFP_ATOMIC))
goto out_kfree_skb;
/* Fragmented skb is linearized if device does not support SG,
 * or if at least one of fragments is in highmem and device
 * does not support DMA from it.
 */
if (skb_shinfo(skb)->nr_frags &&
(!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
__skb_linearize(skb, GFP_ATOMIC))
goto out_kfree_skb;
//Checksum handling:
/* If packet is not checksummed and device does not support
 * checksumming for this protocol, complete checksumming here.
 */
if (skb->ip_summed == CHECKSUM_HW &&
(!(dev->features & (NETIF_F_HW_CSUM | NETIF_F_NO_CSUM)) &&
(!(dev->features & NETIF_F_IP_CSUM) ||
skb->protocol != htons(ETH_P_IP)))) {
if ((skb = skb_checksum_help(skb)) == NULL)
goto out;
}
/* Grab device queue *///queueful device: its queueing discipline is reached via dev->qdisc
spin_lock_bh(&dev->queue_lock);//the queue lock must be held first
q = dev->qdisc; //fetch the device's egress queue
if (q->enqueue) {
rc = q->enqueue(skb, q);//enqueue adds the skb to the queue (dequeue extracts one; requeue puts a previously dequeued skb back, e.g. after a failed transmit)
qdisc_run(dev);//pick the next frame to transmit; indirectly invokes the qdisc's dequeue function (analyzed later)
spin_unlock_bh(&dev->queue_lock);
rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc;
goto out;
}
//Queueless devices take the path below:
/* The device has no queue. Common case for software devices:
loopback, all the sorts of tunnels...
Really, it is unlikely that xmit_lock protection is necessary here.
(f.e. loopback and IP tunnels are clean ignoring statistics
counters.)
However, it is possible, that they rely on protection
made by us here.
Check this and shot the lock. It is not prone from deadlocks.
Either shot noqueue qdisc, it is even simpler 8)
*/
if (dev->flags & IFF_UP) {
int cpu = smp_processor_id();
if (dev->xmit_lock_owner != cpu) {
/*
* The spin_lock effectivly does a preempt lock, but
* we are about to drop that...
*/
preempt_disable();
spin_unlock(&dev->queue_lock);
spin_lock(&dev->xmit_lock);
dev->xmit_lock_owner = cpu;
preempt_enable();
if (!netif_queue_stopped(dev)) {
if (netdev_nit)//deliver a copy to any registered packet taps (sniffers)
dev_queue_xmit_nit(skb, dev);
rc = 0;
if (!dev->hard_start_xmit(skb, dev)) {//e.g. loopback, which hands the skb straight back via netif_rx
dev->xmit_lock_owner = -1;
spin_unlock_bh(&dev->xmit_lock);
goto out;
}
}
dev->xmit_lock_owner = -1;
spin_unlock_bh(&dev->xmit_lock);
if (net_ratelimit())
printk(KERN_CRIT "Virtual device %s asks to "
"queue packet!\n", dev->name);
goto out_enetdown;
} else {
/* Recursion is detected! It is possible,
* unfortunately */
if (net_ratelimit())
printk(KERN_CRIT "Dead loop on virtual device "
"%s, fix it urgently!\n", dev->name);
}
}
//Reached only when the device is down or recursion was detected:
//queue_lock is still held on these paths, so release it here.
spin_unlock_bh(&dev->queue_lock);
out_enetdown:
rc = -ENETDOWN;
out_kfree_skb:
kfree_skb(skb);
out:
return rc;
}
__netif_schedule主要完成两个任务: - 将设备添加到output_queue列表的头部,此列表是接收时所用的与poll_list配对的列表;每个cpu都有一个output_queue,一个output_queue是一个设备列表,包含多个设备;output_queue会由napi以及非napi设备所用;output_queue中的设备是以net_device->next_sched指针连接在一起的;
- 为NET_TX_SOFTIRQ软IRQ调度以准备执行;
原因:
net_tx_action:由软中断触发,设备在两种情境下以raise_softirq_irqoff(NET_TX_SOFTIRQ)触发它,对应两个任务:
- 当传输已完成,且驱动程序通过dev_kfree_skb_irq通知相关联的缓冲区可释放时,此函数收回那些已成功传输缓冲区的sk_buff结构。原因:设备驱动程序在中断环境下运行,必须尽快执行,所以它不使用dev_kfree_skb,而是使用dev_kfree_skb_irq,把要释放的缓冲区指针添加到与cpu关联的softnet_data的completion_queue中,再让net_tx_action去做实际的释放工作;
- 当设备的传输通过netif_wake_queue重新开启时,在确保所有条件吻合后传输帧。
- net_tx_action
45static void net_tx_action(struct softirq_action *h)
{
struct softnet_data *sd = &__get_cpu_var(softnet_data);
if (sd->completion_queue) {//回收所有因设备驱动程序调用dev_kfree_skb_irq而被添加到completion_queue列表的缓冲区
struct sk_buff *clist;
local_irq_disable();//在访问softnet_data时必须关闭中断,因中断处理可能会访问该结构
clist = sd->completion_queue;
sd->completion_queue = NULL;
local_irq_enable();
while (clist) {
struct sk_buff *skb = clist;
clist = clist->next;
BUG_TRAP(!atomic_read(&skb->users));
__kfree_skb(skb);
}
}
if (sd->output_queue) {//若该softnet_data有传输设备队列不为空
struct net_device *head;
local_irq_disable();
head = sd->output_queue;//取得头
sd->output_queue = NULL;
local_irq_enable();
while (head) {//循环每个设备看哪个可以获得锁并传输
struct net_device *dev = head;
head = head->next_sched;
smp_mb__before_clear_bit();
clear_bit(__LINK_STATE_SCHED, &dev->state);
if (spin_trylock(&dev->queue_lock)) {//若取得锁,则运行传输
qdisc_run(dev);->__qdisc_run->qdisc_restart->sch_direct_xmit->dev_hard_start_xmit 从而发送出去。
spin_unlock(&dev->queue_lock);
} else {//否则继续调度
netif_schedule(dev);
}
}
}
} - 最后是通过驱动程序的out(io端口函数)发出去的:
例如以下设备类型:
初始化时:
/* The Airo-specific entries in the device structure. */
dev->hard_start_xmit = &airo_start_xmit;
/*
 * hard_start_xmit handler for the Airo wireless driver: record the skb in
 * a free FID (hardware transmit-buffer id) slot, then either transmit
 * immediately or defer to the driver thread if the semaphore is busy.
 */
static int airo_start_xmit(struct sk_buff *skb, struct net_device *dev) {
s16 len;
int i, j;
struct airo_info *priv = dev->priv;
u32 *fids = priv->fids;
if ( skb == NULL ) {
printk( KERN_ERR "airo: skb == NULL!!!\n" );
return 0;
}
/* Find a vacant FID */
/* The high 16 bits of a fids[] entry are non-zero while the slot is in
 * use: i = first free slot, j = next free slot after i. */
for( i = 0; i < MAX_FIDS / 2 && (fids[i] & 0xffff0000); i++ );
for( j = i + 1; j < MAX_FIDS / 2 && (fids[j] & 0xffff0000); j++ );
if ( j >= MAX_FIDS / 2 ) {
/* At most one slot left: stop the queue before it overflows. */
netif_stop_queue(dev);
if (i == MAX_FIDS / 2) {
/* No slot at all: report failure so the stack requeues the skb. */
priv->stats.tx_fifo_errors++;
return 1;
}
}
/* Enforce the Ethernet minimum frame length (ETH_ZLEN). */
len = ETH_ZLEN < skb->len ? skb->len : ETH_ZLEN;
/* Mark fid as used & save length for later */
fids[i] |= (len << 16);
priv->xmit.skb = skb;
priv->xmit.fid = i;
if (down_trylock(&priv->sem) != 0) {
/* Semaphore busy: flag the pending transmit and wake the driver
 * thread, which will perform airo_end_xmit on our behalf. */
set_bit(FLAG_PENDING_XMIT, &priv->flags);
netif_stop_queue(dev);
set_bit(JOB_XMIT, &priv->flags);
wake_up_interruptible(&priv->thr_wait);
} else
airo_end_xmit(dev);
return 0;
}
/*
 * Second half of transmission: push the saved skb to the hardware.
 * Entered with priv->sem held (taken in airo_start_xmit or by the driver
 * thread); the semaphore is released after the packet is handed over.
 */
static void airo_end_xmit(struct net_device *dev) {
u16 status;
int i;
struct airo_info *priv = dev->priv;
struct sk_buff *skb = priv->xmit.skb;
int fid = priv->xmit.fid;
u32 *fids = priv->fids;
clear_bit(JOB_XMIT, &priv->flags);
clear_bit(FLAG_PENDING_XMIT, &priv->flags);
status = transmit_802_3_packet (priv, fids[fid], skb->data);
up(&priv->sem);
i = 0;
if ( status == SUCCESS ) {
dev->trans_start = jiffies;
/* Scan for another free FID slot so the queue can be re-enabled. */
for (; i < MAX_FIDS / 2 && (priv->fids[i] & 0xffff0000); i++);
} else {
/* Transmit failed: release the slot again and count the error. */
priv->fids[fid] &= 0xffff;
priv->stats.tx_window_errors++;
}
/* Wake the queue if a slot is free (on failure i == 0, always true). */
if (i < MAX_FIDS / 2)
netif_wake_queue(dev);
dev_kfree_skb(skb);
}
/*
 * transmit_802_3_packet - push one 802.3 frame into the Airo NIC via BAP.
 *
 * NOTE(review): abridged excerpt. txFid, payloadLen, miclen, pMic and cmd
 * are declared in the full driver source, and the tail of the function
 * (issuing CMD_TRANSMIT and returning) is elided here.
 */
static int transmit_802_3_packet(struct airo_info *ai, int len, char *pPacket)
{
	// packet is destination[6], source[6], payload[len-12]
	// write the payload length and dst/src/payload
	if (bap_setup(ai, txFid, 0x0036, BAP1) != SUCCESS) return ERROR;
	/* The hardware addresses aren't counted as part of the payload, so
	 * we have to subtract the 12 bytes for the addresses off */
	/* NOTE(review): the comment above says "subtract 12 bytes", but this
	 * excerpt computes len + miclen — confirm against the full source. */
	payloadLen = cpu_to_le16(len + miclen);
	bap_write(ai, &payloadLen, sizeof(payloadLen),BAP1);
	bap_write(ai, (const u16*)pPacket, sizeof(etherHead), BAP1);
	if (miclen)
		bap_write(ai, (const u16*)&pMic, miclen, BAP1);
	bap_write(ai, (const u16*)(pPacket + sizeof(etherHead)), len, BAP1);
	// issue the transmit command
	memset( &cmd, 0, sizeof( cmd ) );
	cmd.cmd = CMD_TRANSMIT;
	cmd.parm0 = txFid;
	/* ... remainder of the function elided in this excerpt ... */
}
/*
 * bap_write - copy a buffer to the card through a Buffer Access Path.
 *
 * The byte count is rounded up to an even value, then the data is pushed
 * out the BAP data port: byte-wise when 8-bit I/O is forced, otherwise as
 * 16-bit words. Always reports SUCCESS.
 */
static int bap_write(struct airo_info *ai, const u16 *pu16Src,
	int bytelen, int whichbap)
{
	bytelen = (bytelen + 1) & (~1); /* round up to an even byte count */

	if (do8bitIO)
		outsb(ai->dev->base_addr + DATA0 + whichbap, pu16Src, bytelen);
	else
		outsw(ai->dev->base_addr + DATA0 + whichbap,
			pu16Src, bytelen >> 1);

	return SUCCESS;
}
netif_schedule和几个基础函数:
几个基础函数:涉及设备的开启和关闭: