介绍
The Kernel NIC Interface (KNI) is a DPDK control plane solution that allows userspace applications to exchange packets with the kernel networking stack. To accomplish this, DPDK userspace applications use an IOCTL call to request the creation of a KNI virtual device in the Linux* kernel. The IOCTL call provides interface information and the DPDK’s physical address space, which is re-mapped into the kernel address space by the KNI kernel loadable module that saves the information to a virtual device context. The DPDK creates FIFO queues for packet ingress and egress to the kernel module for each device allocated.
The KNI kernel loadable module is a standard net driver, which, upon receiving the IOCTL call, accesses the DPDK’s FIFO queues to receive/transmit packets from/to the DPDK userspace application. The FIFO queues contain pointers to data packets in the DPDK. This:
Provides a faster mechanism to interface with the kernel net stack and eliminates system calls
Facilitates the DPDK using standard Linux* userspace net tools (tcpdump, ftp, and so on)
Eliminates the copy_to_user and copy_from_user operations on packets.
测试
Load KNI kernel module:
# insmod /root/dpdk/x86/lib/modules/3.10.0-514.el7.x86_64/extra/dpdk/rte_kni.ko
Build KNI application:
# export RTE_SDK=/root/dpdk/x86/share/dpdk
# cd examples/kni
# make
CC main.o
LD kni
INSTALL-APP kni
INSTALL-MAP kni.map
Run KNI application:
# build/kni -c 0x0f -n 2 -- -P -p 0x3 --config="(0,0,1),(1,2,3)"
EAL: Detected 4 lcore(s)
EAL: No free hugepages reported in hugepages-1048576kB
EAL: Probing VFIO support...
EAL: WARNING: cpu flags constant_tsc=yes nonstop_tsc=no -> using unreliable clock cycles !
EAL: PCI device 0000:00:05.0 on NUMA socket -1
EAL: probe driver: 8086:100e net_e1000_em
EAL: PCI device 0000:00:06.0 on NUMA socket -1
EAL: probe driver: 8086:100e net_e1000_em
EAL: PCI device 0000:00:07.0 on NUMA socket -1
EAL: probe driver: 8086:100e net_e1000_em
APP: Initialising port 0 ...
KNI: pci: 00:06:00 8086:100e
APP: Initialising port 1 ...
KNI: pci: 00:07:00 8086:100e
Checking link status
.....done
Port 0 Link Up - speed 1000 Mbps - full-duplex
Port 1 Link Up - speed 1000 Mbps - full-duplex
APP: Lcore 1 is writing to port 0
APP: Lcore 2 is reading from port 1
APP: Lcore 0 is reading from port 0
APP: Lcore 3 is writing to port 1
...
其中,
-c = core bitmask
-P = promiscuous mode
-p = port hex bitmask
--config="(port, lcore_rx, lcore_tx [,lcore_kthread, ...]) ..."
Note that each core can do either TX or RX for one port only.
[root@vm01 ~]# ip a
...
7: vEth0: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN qlen 1000
link/ether ba:92:66:e5:2f:35 brd ff:ff:ff:ff:ff:ff
8: vEth1: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN qlen 1000
link/ether b2:64:67:2f:32:4a brd ff:ff:ff:ff:ff:ff
[root@vm01 ~]# ip addr add 10.0.10.30/24 dev vEth0
[root@vm01 ~]# ip link set vEth0 up
[root@vm03 ~]# ping -c 3 10.0.10.30
PING 10.0.10.30 (10.0.10.30) 56(84) bytes of data.
64 bytes from 10.0.10.30: icmp_seq=1 ttl=64 time=14.2 ms
64 bytes from 10.0.10.30: icmp_seq=2 ttl=64 time=2.96 ms
64 bytes from 10.0.10.30: icmp_seq=3 ttl=64 time=1.89 ms
给kni应用进程发送SIGUSR1信号,kni应用进程会输出统计信息:
[root@vm01 ~]# pkill -10 kni
...
**KNI example application statistics**
====== ============== ============ ============ ============ ============
Port Lcore(RX/TX) rx_packets rx_dropped tx_packets tx_dropped
------ -------------- ------------ ------------ ------------ ------------
0 0/ 1 23 0 5 0
1 2/ 3 1 0 0 0
====== ============== ============ ============ ============ ============
实现
相关代码:
KNI示例程序位于examples/kni,KNI内核模块位于lib/librte_eal/linuxapp/kni,KNI library位于lib/librte_kni。
整体实现如下:
数据接收
先调用rte_eth_rx_burst从网络接口读取数据,然后调用rte_kni_tx_burst通过FIFO传给内核模块。
/* Burst rx from eth */
nb_rx = rte_eth_rx_burst(port_id, 0, pkts_burst, PKT_BURST_SZ);
/* Burst tx to kni */
num = rte_kni_tx_burst(p->kni[i], pkts_burst, nb_rx);
rte_kni_tx_burst:
///librte_kni
/*
 * Hand a burst of mbufs to the kernel side of the KNI device.
 *
 * Every mbuf pointer is converted with va2pa() and the converted pointers
 * are enqueued on kni->rx_q. Afterwards kni_free_mbufs() is called to get
 * mbufs from free_q and free them.
 *
 * Returns the number of entries kni_fifo_put() reports as enqueued, which
 * may be smaller than num.
 */
unsigned
rte_kni_tx_burst(struct rte_kni *kni, struct rte_mbuf **mbufs, unsigned num)
{
	void *converted[num];
	unsigned int idx = 0;
	unsigned int enqueued;

	/* Translate each mbuf address before it is placed on the fifo. */
	while (idx < num) {
		converted[idx] = va2pa(mbufs[idx]);
		idx++;
	}

	enqueued = kni_fifo_put(kni->rx_q, converted, num);

	/* Get mbufs from free_q and then free them */
	kni_free_mbufs(kni);

	return enqueued;
}
/**
 * Enqueue up to num pointers from data onto the circular fifo.
 *
 * The write and read indices are sampled once on entry. Elements are stored
 * one at a time until either num elements have been written or advancing the
 * write index would make it equal to the read index (fifo full). The new
 * write index is published once, after the loop.
 *
 * Returns the number of elements actually written.
 */
static inline unsigned
kni_fifo_put(struct rte_kni_fifo *fifo, void **data, unsigned num)
{
	unsigned stored;
	unsigned head = fifo->write;
	unsigned tail = fifo->read;

	for (stored = 0; stored < num; stored++) {
		/* Slot that the write index would move to after this store. */
		unsigned next = (head + 1) & (fifo->len - 1);

		/* Writing must never catch up with the read position. */
		if (next == tail)
			break;

		fifo->buffer[head] = data[stored];
		head = next;
	}

	fifo->write = head;
	return stored;
}
- fifo
DPDK应用通过fifo与内核模块交换数据,fifo实际上是一块环形共享内存:
/*
 * Descriptor of a circular-buffer FIFO that is mapped in shared memory.
 *
 * Invariants stated for this structure:
 *  - the write and read indices wrap around the buffer;
 *  - the fifo is empty when write == read;
 *  - a writer must never overwrite the read position.
 *
 * NOTE(review): kni_fifo_put wraps indices with "& (len - 1)", so len is
 * presumably required to be a power of two - confirm against kni_fifo_init.
 */
struct rte_kni_fifo {
volatile unsigned write; /**< Index of the next position to be written */
volatile unsigned read; /**< Index of the next position to be read */
unsigned len; /**< Circular buffer length, in elements */
unsigned elem_size; /**< Pointer size - for 32/64 bit OS */
void *volatile buffer[]; /**< Flexible array holding the mbuf pointers */
};
DPDK应用程序在初始化时,需要将fifo共享内存的地址告诉KNI内核模块:
/*
 * Excerpt of rte_kni_alloc; elided code is marked with "///...".
 *
 * The visible part sets up the TX and RX fifos from the slot's memzones:
 * each fifo is placed at the memzone's address, initialised with
 * kni_fifo_init(), and the memzone's physical address is recorded in
 * dev_info. dev_info is then passed to the RTE_KNI_IOCTL_CREATE ioctl.
 */
struct rte_kni *
rte_kni_alloc(struct rte_mempool *pktmbuf_pool,
const struct rte_kni_conf *conf,
struct rte_kni_ops *ops)
{
///...
/* TX RING: fifo lives at mz->addr, its physical address goes to dev_info */
mz = slot->m_tx_q;
ctx->tx_q = mz->addr;
kni_fifo_init(ctx->tx_q, KNI_FIFO_COUNT_MAX);
dev_info.tx_phys = mz->phys_addr;
/* RX RING: same set-up as the TX ring, using the slot's RX memzone */
mz = slot->m_rx_q;
ctx->rx_q = mz->addr;
kni_fifo_init(ctx->rx_q, KNI_FIFO_COUNT_MAX);
dev_info.rx_phys = mz->phys_addr;
///...
ret = ioctl(kni_fd, RTE_KNI_IOCTL_CREATE, &dev_info); /// handled by the KNI kernel module
}
KNI kernel module
static int
kni_ioctl(struct inode *inode, uint32_t ioctl_num, unsigned long ioctl_param)
{
///..
case _IOC_NR(RTE_KNI_IOCTL_CREATE):
ret = kni_ioctl_create(net, ioctl_num, ioctl_param);
kni_ioctl_create会创建对应的网络设备vEthX,然后设置对应的fifo共享内存,并启动对应的内核线程:
/*
 * Excerpt of kni_ioctl_create; elided code is marked with "///...".
 *
 * The visible part:
 *  1. allocates a net device named dev_info.name with alloc_netdev(), with
 *     private data sized for struct kni_dev and kni_net_init as the setup
 *     callback;
 *  2. converts the four fifo physical addresses carried in dev_info
 *     (tx, rx, alloc, free) with phys_to_virt() and stores them in kni;
 *  3. calls kni_run_thread() with dev_info.force_bind.
 */
static int
kni_ioctl_create(struct net *net, uint32_t ioctl_num,
unsigned long ioctl_param)
{
///...
/* NET_NAME_USER is passed as an extra argument only when it is defined. */
net_dev = alloc_netdev(sizeof(struct kni_dev), dev_info.name,
#ifdef NET_NAME_USER
NET_NAME_USER,
#endif
kni_net_init);
///...
/* Translate user space info into kernel space info */
kni->tx_q = phys_to_virt(dev_info.tx_phys);
kni->rx_q = phys_to_virt(dev_info.rx_phys);
kni->alloc_q = phys_to_virt(dev_info.alloc_phys);
kni->free_q = phys_to_virt(dev_info.free_phys);
///...
/* Start the thread that services this device. */
ret = kni_run_thread(knet, kni, dev_info.force_bind);
///...
}
kernel thread:
KNI内核线程不断从fifo共享内存读取数据,然后交给内核协议栈继续处理:
static int
kni_thread_single(void *data)
{
struct kni_net *knet = data;
int j;
struct kni_dev *dev;
while (!kthread_should_stop()) {
down_read(&knet->kni_list_lock);
for (j = 0; j < KNI_RX_LOOP_NUM; j++) {
list_for_each_entry(dev, &knet->kni_list_head, list) {
kni_net_rx(dev);
kni_net_poll_resp(dev);
}
}
up_read(&knet->kni_list_lock);
///...
}
/*
 * rx interface: forwards the device to whichever receive handler
 * kni_net_rx_func currently points at.
 */
void
kni_net_rx(struct kni_dev *kni)
{
	/* No NULL check is needed: the pointer has a default value. */
	kni_net_rx_func(kni); ///kni_net_rx_normal
}
- kni_net_rx_func
kni_net_rx_func(默认指向kni_net_rx_normal)从fifo共享内存读取数据,然后分配skb,拷贝数据,调用netif_rx_ni进入内核协议栈:
static void kni_net_rx_normal(struct kni_dev *kni);
/* kni rx function pointer, with default to normal rx */
static kni_net_rx_t kni_net_rx_func = kni_net_rx_normal;
/*
 * RX: normal working mode
 *
 * Excerpt; elided code is marked with "///...". The visible part dequeues
 * a burst of entries from kni->rx_q into kni->pa, and for each entry
 * resolves the mbuf and its data address, allocates an skb, fills in the
 * skb's device/protocol/checksum fields and passes it to netif_rx_ni().
 */
static void
kni_net_rx_normal(struct kni_dev *kni)
{
///...
/* Calculate the number of entries to dequeue from rx_q */
num_rx = min_t(uint32_t, num_fq, MBUF_BURST_SZ);
/* Burst dequeue from rx_q */
num_rx = kni_fifo_get(kni->rx_q, kni->pa, num_rx);
/* Nothing was dequeued, so there is nothing to deliver. */
if (num_rx == 0)
return;
///...
/* Transfer received packets to netif */
for (i = 0; i < num_rx; i++) {
/* Address conversions for the mbuf and its packet data. */
kva = pa2kva(kni->pa[i]);
len = kva->pkt_len;
data_kva = kva2data_kva(kva);
kni->va[i] = pa2va(kni->pa[i], kva);
/* NOTE(review): the 2 extra bytes are presumably alignment headroom; the code that uses them is elided - confirm. */
skb = dev_alloc_skb(len + 2);
///...
skb->dev = dev;
skb->protocol = eth_type_trans(skb, dev);
skb->ip_summed = CHECKSUM_UNNECESSARY;
/* Call netif interface */
netif_rx_ni(skb); /// enters the kernel protocol stack
///...
}
///...
}
发送数据
- KNI kernel interface
/*
* Transmit a packet (called by the kernel)
*/
static int
kni_net_tx(struct sk_buff *skb, struct net_device *dev)
{
///...
/* dequeue a mbuf from alloc_q */
ret = kni_fifo_get(kni->alloc_q, &pkt_pa, 1);
if (likely(ret == 1)) {
void *data_kva;
pkt_kva = pa2kva(pkt_pa);
data_kva = kva2data_kva(pkt_kva);
pkt_va = pa2va(pkt_pa, pkt_kva);
len = skb->len; /// data length
memcpy(data_kva, skb->data, len); /// copy data
if (unlikely(len < ETH_ZLEN)) {
memset(data_kva + len, 0, ETH_ZLEN - len);
len = ETH_ZLEN;
}
pkt_kva->pkt_len = len;
pkt_kva->data_len = len;
/* enqueue mbuf into tx_q */
ret = kni_fifo_put(kni->tx_q, &pkt_va, 1);/// put tx_q
///...
- DPDK app
/* Burst rx from kni */
num = rte_kni_rx_burst(p->kni[i], pkts_burst, PKT_BURST_SZ);
/* Burst tx to eth */
nb_tx = rte_eth_tx_burst(port_id, 0, pkts_burst, (uint16_t)num);
rte_kni_rx_burst从tx_q队列取出mbuf:
/*
 * Fetch up to num mbufs that were placed on kni->tx_q.
 *
 * The dequeued pointers are written into mbufs. When at least one buffer
 * was removed, kni_allocate_mbufs() is called to allocate mbufs and put
 * them into alloc_q.
 *
 * Returns the number of mbufs dequeued.
 */
unsigned
rte_kni_rx_burst(struct rte_kni *kni, struct rte_mbuf **mbufs, unsigned num)
{
	unsigned received;

	received = kni_fifo_get(kni->tx_q, (void **)mbufs, num);
	if (received != 0) {
		/* Buffers were removed: allocate mbufs and put them into alloc_q */
		kni_allocate_mbufs(kni);
	}

	return received;
}