Your browser is out-of-date!

Update your browser to view this website correctly. Update my browser now

Vinllen Chen


To be a better coder

ovs datapath笔记

  这几天在学习ovs的源码,现在来做个datapath的笔记。事先声明,本文的内容摘自博客,源码,视频以及自己的总结等,具体参考文献已经在文章末尾给出,主要参考为Baohua Yang的OpenvSwitch 代码分析

0.模块介绍

  datapath为ovs内核模块,负责执行数据交换,也就是把从接收端口收到的数据包在流表中进行匹配,并执行匹配到的动作。
  一个datapath可以对应多个vport,一个vport类似物理交换机的端口概念。一个datapath关联一个flow table,一个flow table包含多个条目,每个条目包括两个内容:一个match/key和一个action,一个match/key可以从包中获取,对应一个action处理行为,最常见的action是在不同flow中进行转发。下图所示的是1个ovs下的几个flow table,以及包在不同flow table进行转发的情况。
flow1
flow1

1.涉及的结构体

1.1 vport模块

vport结构体:

/**
 * struct vport - one port within a datapath
 * @rcu: RCU callback head for deferred destruction.
 * @dp: Datapath to which this port belongs.
 * @upcall_portids: RCU protected 'struct vport_portids'.
 * @port_no: Index into @dp's @ports array.
 * @hash_node: Element in @dev_table hash table in vport.c.
 * @dp_hash_node: Element in @datapath->ports hash table in datapath.c.
 * @ops: Class structure.
 * @percpu_stats: Points to per-CPU statistics used and maintained by vport
 * @err_stats: Error statistics used and maintained by vport.  Unlike
 * @percpu_stats this is embedded in the structure, not pointed to.
 */
struct vport {  
    struct rcu_head rcu;
    struct datapath *dp;
    struct vport_portids __rcu *upcall_portids;
    u16 port_no;

    struct hlist_node hash_node;
    struct hlist_node dp_hash_node;
    const struct vport_ops *ops;

    struct pcpu_sw_netstats __percpu *percpu_stats;

    struct vport_err_stats err_stats;
};

vport_parms结构体,是创建vport所需要的参数结构。

/**
 * struct vport_parms - parameters for creating a new vport
 *
 * @name: New vport's name.
 * @type: New vport's type.
 * @options: %OVS_VPORT_ATTR_OPTIONS attribute from Netlink message, %NULL if
 * none was supplied.
 * @dp: New vport's datapath.
 * @port_no: New vport's port number.
 * @upcall_portids: Netlink attribute describing the new vport's upcall port
 * ids.  NOTE(review): presumably the source of &vport->upcall_portids --
 * confirm against the caller that fills this structure in.
 */
struct vport_parms {  
    const char *name;
    enum ovs_vport_type type;
    struct nlattr *options;

    /* For ovs_vport_alloc(). */
    struct datapath *dp;
    u16 port_no;
    struct nlattr *upcall_portids;
};

vport_ops 定义了对vport的操作

/**
 * struct vport_ops - definition of a type of virtual port
 *
 * @type: %OVS_VPORT_TYPE_* value for this type of virtual port.
 * @create: Create a new vport configured as specified.  On success returns
 * a new vport allocated with ovs_vport_alloc(), otherwise an ERR_PTR() value.
 * @destroy: Destroys a vport.  Must call vport_free() on the vport but not
 * before an RCU grace period has elapsed.
 * @set_options: Modify the configuration of an existing vport.  May be %NULL
 * if modification is not supported.
 * @get_options: Appends vport-specific attributes for the configuration of an
 * existing vport to a &struct sk_buff.  May be %NULL for a vport that does not
 * have any configuration.
 * @get_name: Get the device's name.
 * @send: Send a packet on the device.  Returns the length of the packet sent,
 * zero for dropped packets or negative for error.
 * @get_egress_tun_info: Get the egress tunnel 5-tuple and other info for
 * a packet.
 */
struct vport_ops {  
    enum ovs_vport_type type;

    /* Called with ovs_mutex. */
    struct vport *(*create)(const struct vport_parms *);
    void (*destroy)(struct vport *);

    /* Configuration accessors; either may be NULL (see above). */
    int (*set_options)(struct vport *, struct nlattr *);
    int (*get_options)(const struct vport *, struct sk_buff *);

    /* Called with rcu_read_lock or ovs_mutex. */
    const char *(*get_name)(const struct vport *);

    /* Per-packet hooks. */
    int (*send)(struct vport *, struct sk_buff *);
    int (*get_egress_tun_info)(struct vport *, struct sk_buff *,
                   struct ovs_tunnel_info *);
};

vport_ops_list[]是vport_ops组成的数组。vport_ops实例化的全部类型如下

/* List of statically compiled vport implementations.  Don't forget to also
 * add yours to the list at the bottom of vport.h.
 */
static const struct vport_ops *vport_ops_list[] = {  
    &ovs_netdev_vport_ops,
    &ovs_internal_vport_ops,
    &ovs_geneve_vport_ops,
    /* The two GRE implementations are listed only when
     * CONFIG_NET_IPGRE_DEMUX is enabled.
     */
#if IS_ENABLED(CONFIG_NET_IPGRE_DEMUX)
    &ovs_gre_vport_ops,
    &ovs_gre64_vport_ops,
#endif
    &ovs_vxlan_vport_ops,
    &ovs_lisp_vport_ops,
};

比如当我们在为网桥增设端口的时候,就会进入ovs_netdev_vport_ops中的create方法,进而注册网络设备。

1.2 flow模块

  定义维护交换机本地流表相关的数据结构和操作,包括流表结构的创建、更新、删除,对每条流的管理等。位于datapath/flow.h和datapath/flow_table.h中。
flow table:

/* One instance of the flow hash table.  'struct flow_table' refers to an
 * instance through an RCU-annotated pointer.
 */
struct table_instance {  
    struct flex_array *buckets; /* Bucket storage. */
    unsigned int n_buckets;     /* Presumably the number of entries in
                                 * 'buckets' -- TODO confirm. */
    struct rcu_head rcu;        /* RCU callback head. */
    int node_ver;               /* NOTE(review): likely selects which of the
                                 * two sw_flow 'hash_node' entries this
                                 * instance links through -- confirm. */
    u32 hash_seed;
    bool keep_flows;
};
/* Flow table of a datapath: a table instance plus mask bookkeeping. */
struct flow_table {  
    struct table_instance __rcu *ti;                /* RCU-annotated table instance. */
    struct mask_cache_entry __percpu *mask_cache;   /* Per-CPU mask cache. */
    struct mask_array __rcu *mask_array;            /* RCU-annotated mask array. */
    unsigned long last_rehash;  /* NOTE(review): presumably a timestamp of
                                 * the last rehash -- confirm units. */
    unsigned int count;         /* Presumably the number of flows stored --
                                 * TODO confirm. */
};
/* A single flow entry: its key (masked and unmasked), the mask applied, the
 * actions to execute and per-NUMA-node statistics.
 *
 * The structure ends in a flexible array, so objects must be allocated with
 * room for the 'stats' pointers (ovs_flow_init() sizes the slab cache with
 * one pointer per possible NUMA node).
 */
struct sw_flow {  
    struct rcu_head rcu;
    struct hlist_node hash_node[2];
    u32 hash;
    int stats_last_writer;      /* NUMA-node id of the last writer on
                     * 'stats[0]'.
                     */
    struct sw_flow_key key;
    struct sw_flow_key unmasked_key;
    struct sw_flow_mask *mask;
    struct sw_flow_actions __rcu *sf_acts;
    struct flow_stats __rcu *stats[]; /* One for each NUMA node.  First one
                       * is allocated at flow creation time,
                       * the rest are allocated on demand
                       * while holding the 'stats[0].lock'.
                       */
};

1.3 genl 模块

  datapath运行在内核态,ovsd运行在用户态,两者通过netlink通信。
  因为大量的专用family会占用family id,而family id数量自身有限(kernel允许32个);同时为了方便用户扩展使用,一个通用的netlink family被定义出来,这就是generic netlink family。
  具体接口可以参考:generic_netlink_howto 和 libnl库,这两个介绍得很详细。

2. datapath.c

  我们来看一下初始化代码到底做了些什么事情:

/* Module initialization for the Open vSwitch datapath.
 *
 * Initializes the pieces in a fixed order: the deferred-action fifos, the
 * internal-device rtnl link ops, the flow module, the vport subsystem, the
 * pernet device operations (ovs_net_ops), the netdevice notifier and finally
 * the Generic Netlink families.
 *
 * If a step fails, control jumps to the label that undoes everything set up
 * before it; the labels fall through so teardown happens in reverse order of
 * initialization.
 *
 * Returns 0 on success, otherwise the error code of the step that failed.
 */
static int __init dp_init(void)  
{
    int err;

    /* Compile-time check: struct ovs_skb_cb must fit in sk_buff's cb field. */
    BUILD_BUG_ON(sizeof(struct ovs_skb_cb) > FIELD_SIZEOF(struct sk_buff, cb));

    pr_info("Open vSwitch switching datapath %s, built "__DATE__" "__TIME__"\n",
        VERSION);

    err = action_fifos_init();
    if (err)
        goto error;

    err = ovs_internal_dev_rtnl_link_register();
    if (err)
        goto error_action_fifos_exit;

    err = ovs_flow_init();
    if (err)
        goto error_unreg_rtnl_link;

    err = ovs_vport_init();
    if (err)
        goto error_flow_exit;

    err = register_pernet_device(&ovs_net_ops);
    if (err)
        goto error_vport_exit;

    err = register_netdevice_notifier(&ovs_dp_device_notifier);
    if (err)
        goto error_netns_exit;

    err = dp_register_genl();
    if (err < 0)
        goto error_unreg_notifier;

    return 0;

    /* Unwind path: each label undoes one step, then falls through. */
error_unreg_notifier:  
    unregister_netdevice_notifier(&ovs_dp_device_notifier);
error_netns_exit:  
    unregister_pernet_device(&ovs_net_ops);
error_vport_exit:  
    ovs_vport_exit();
error_flow_exit:  
    ovs_flow_exit();
error_unreg_rtnl_link:  
    ovs_internal_dev_rtnl_link_unregister();
error_action_fifos_exit:  
    action_fifos_exit();
error:  
    return err;
}

  其主要分为以下几部分:

  • action_fifos_init() 其初始化了延迟操作action的队列,这些action在放入队列后将在之后被调用:
/* Allocates the per-CPU 'struct action_fifo' storage and publishes it in
 * 'action_fifos'.
 *
 * Returns 0 on success or -ENOMEM if the per-CPU allocation fails.
 */
int action_fifos_init(void)
{
    action_fifos = alloc_percpu(struct action_fifo);

    return action_fifos ? 0 : -ENOMEM;
}
/* Capacity of one deferred-action fifo, in entries. */
#define DEFERRED_ACTION_FIFO_SIZE 10
/* Fixed-size queue of deferred actions.  action_fifos_init() allocates one
 * of these per CPU.
 */
struct action_fifo {  
    int head;   /* NOTE(review): presumably the insertion index -- confirm. */
    int tail;   /* NOTE(review): presumably the removal index -- confirm. */
    /* Deferred action fifo queue storage. */
    struct deferred_action fifo[DEFERRED_ACTION_FIFO_SIZE];
};
  • ovs_internal_dev_rtnl_link_register()
    调用rtnl_link_register
/* Registers 'internal_dev_link_ops' with rtnl_link_register() and returns
 * that call's result unchanged.
 */
int ovs_internal_dev_rtnl_link_register(void)  
{
    return rtnl_link_register(&internal_dev_link_ops);
}
  • ovs_flow_init()
    初始化flow模块
/* Initializes the flow module.
 * Returns zero if successful or a negative error code.
 */
int ovs_flow_init(void)  
{
    BUILD_BUG_ON(__alignof__(struct sw_flow_key) % __alignof__(long));
    BUILD_BUG_ON(sizeof(struct sw_flow_key) % sizeof(long));

    flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow)
                       + (num_possible_nodes()
                      * sizeof(struct flow_stats *)),
                       0, 0, NULL);
    if (flow_cache == NULL)
        return -ENOMEM;

    flow_stats_cache
        = kmem_cache_create("sw_flow_stats", sizeof(struct flow_stats),
                    0, SLAB_HWCACHE_ALIGN, NULL);
    if (flow_stats_cache == NULL) {
        kmem_cache_destroy(flow_cache);
        flow_cache = NULL;
        return -ENOMEM;
    }

    return 0;
}
  • ovs_vport_init()
    初始化vport子系统
/**
 *    ovs_vport_init - initialize vport subsystem
 *
 * Called at module load time to initialize the vport subsystem.
 *
 * Allocates the zero-filled 'dev_table' array of %VPORT_HASH_BUCKETS hash
 * list heads.
 *
 * Returns 0 on success or -ENOMEM if the table cannot be allocated.
 */
int ovs_vport_init(void)
{
    /* kcalloc() zeroes the memory like kzalloc() and additionally checks
     * the count * size multiplication for overflow.
     */
    dev_table = kcalloc(VPORT_HASH_BUCKETS, sizeof(struct hlist_head),
                GFP_KERNEL);
    if (!dev_table)
        return -ENOMEM;

    return 0;
}
  • register_pernet_device(&ovs_net_ops)
    注册网络名字空间设备。
  • register_netdevice_notifier(&ovs_dp_device_notifier)
    注册设备通知事件。
  • dp_register_genl() 初始化dp相关的netlink族。
/* Registers every Generic Netlink family listed in 'dp_genl_families'.
 *
 * Registration stops at the first failure; dp_unregister_genl() is then
 * called with the number of families that had been registered successfully,
 * and the failing call's error code is returned.  Returns 0 when all
 * families are registered.
 */
static int dp_register_genl(void)
{
    int idx;

    for (idx = 0; idx < ARRAY_SIZE(dp_genl_families); idx++) {
        int rc = genl_register_family(dp_genl_families[idx]);

        if (rc) {
            dp_unregister_genl(idx);
            return rc;
        }
    }

    return 0;
}

dp_genl_families[]数组静态定义如下:

/* Generic Netlink families provided by the datapath module (datapath, vport,
 * flow and packet).  dp_register_genl() registers them in array order.
 */
static struct genl_family *dp_genl_families[] = {  
    &dp_datapath_genl_family,
    &dp_vport_genl_family,
    &dp_flow_genl_family,
    &dp_packet_genl_family,
};

调用dp_register_genl()完成对四种类型的family以及相应操作的注册,包括datapath、vport、flow和packet。前三种family都对应四种操作:NEW、DEL、GET、SET,而packet的操作仅为EXECUTE。

3. 添加网桥命令产生的代码流

  1. 键入命令 ovs-vsctl add-br testBr
  2. 内核中的 openvswitch.ko 收到一个添加网桥的指令时候,即接收到OVS_DATAPATH_FAMILY 通道的 OVS_DP_CMD_NEW 命令。
    该命令绑定的回调函数为ovs_dp_cmd_new。
  3. ovs_dp_cmd_new
    函数除了初始化 dp 结构外,还调用 new_vport()函数来生成一个新的 vport。
  4. new_vport
    new_vport()函数则调用 ovs_vport_add()函数,来尝试生成一个新的 vport。
  5. ovs_vport_add
    ovs_vport_add()函数会检查 vport 类型,并调用相关的 create()函数来生成 vport 结构。
  6. 可见,当 dp 是网络设备时(vport-netdev.c),最终由ovs_vport_add()函数调用的是netdev_create()函数,而 netdev_create()函数中最关键的一步是利用netdev_rx_handler_register注册了收到网包时的回调函数。
    netdev_rx_handler_register(struct net_device *dev, rx_handler_func_t *rx_handler, void *rx_handler_data)是linux内核实现的函数,为网络设备dev注册一个receive handler,rx_handler_data指向的是这个receive handler使用的内存区域。这个handler以后会被 __netif_receive_skb()调用。也就是说,执行netdev_rx_handler_register(netdev_vport->dev, netdev_frame_hook, vport);之后,该设备在收到packet时会调用 netdev_frame_hook处理。

4.packet接收处理

  继续上面所说的,当接收包将会发生如下代码流:

  1. netif_receive_skb
    netif_receive_skb(struct sk_buff *skb)从网络中接收数据,它是主要的接收数据处理函数,总是成功,这个buffer在拥塞处理或协议层的时候可能被丢弃。这个函数只能从软中断环境(softirq context)中调用,并且中断允许。返回值NET_RX_SUCCESS表示没有拥塞,NET_RX_DROP包丢弃。
  2. netdev_frame_hook()
    其调用netdev_port_receive()
  3. netdev_port_receive()
    函数netdev_port_receive()首先检查是否skb被共享,若是则得到一个packet的拷贝。 其调用ovs_vport_receive()。检查包的校验和,然后交付给我们的vport通用层来处理。
  4. ovs_vport_receive()
    将收到的packet传给datapath处理。 其调用ovs_dp_process_received_packet()
  5. ovs_dp_process_received_packet()
    ovs_dp_process_received_packet()(datapath/datapath.c)中进行复杂的包处理过程,进行流查表,查表后执行对应的行为。当查找失败时候,使用ovs_dp_upcall()发送 upcall到用户空间(ovs-vswitchd)。此后处理过程交给 ovsd 处理。其将产生以下代码流: ovs_dp_process_received_packet() => ovs_dp_upcall() => queue_userspace_packet()
    本步骤具体内容可以参考我的另外一篇博客:ovs中流表在内核空间与用户空间的匹配过程

5. datapath与ovsd的通信机制

  这些family和操作的定义均在datapath.c中。以 flow family 为例。代码为:
nla_policy

/* Netlink attribute policy for the flow family, indexed by OVS_FLOW_ATTR_*.
 * KEY, MASK and ACTIONS are nested attributes; CLEAR and PROBE are flags.
 */
static const struct nla_policy flow_policy[OVS_FLOW_ATTR_MAX + 1] = {  
    [OVS_FLOW_ATTR_KEY] = { .type = NLA_NESTED },
    [OVS_FLOW_ATTR_MASK] = { .type = NLA_NESTED },
    [OVS_FLOW_ATTR_ACTIONS] = { .type = NLA_NESTED },
    [OVS_FLOW_ATTR_CLEAR] = { .type = NLA_FLAG },
    [OVS_FLOW_ATTR_PROBE] = { .type = NLA_FLAG },
};

对generic netlink中flow的操作:

/* Generic Netlink operations of the flow family.
 *
 * Every command uses 'flow_policy' for its attributes.  NEW, DEL and SET are
 * flagged GENL_ADMIN_PERM; GET carries no flag and is the only command that
 * also supplies a dump handler.
 */
static struct genl_ops dp_flow_genl_ops[] = {  
    { .cmd = OVS_FLOW_CMD_NEW,
      .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
      .policy = flow_policy,
      .doit = ovs_flow_cmd_new
    },
    { .cmd = OVS_FLOW_CMD_DEL,
      .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
      .policy = flow_policy,
      .doit = ovs_flow_cmd_del
    },
    { .cmd = OVS_FLOW_CMD_GET,
      .flags = 0,           /* OK for unprivileged users. */
      .policy = flow_policy,
      .doit = ovs_flow_cmd_get,
      .dumpit = ovs_flow_cmd_dump
    },
    { .cmd = OVS_FLOW_CMD_SET,
      .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
      .policy = flow_policy,
      .doit = ovs_flow_cmd_set,
    },
};

generic netlink中的flow的genl_family初始化:

/* Generic Netlink family definition for flow commands.  Ties together the
 * family name and version, the operations in 'dp_flow_genl_ops' and a single
 * multicast group; messages carry a 'struct ovs_header' as family header.
 */
static struct genl_family dp_flow_genl_family = {  
    .id = GENL_ID_GENERATE,     /* NOTE(review): presumably lets the kernel
                                 * pick the family id -- confirm. */
    .hdrsize = sizeof(struct ovs_header),
    .name = OVS_FLOW_FAMILY,
    .version = OVS_FLOW_VERSION,
    .maxattr = OVS_FLOW_ATTR_MAX,
    .netnsok = true,
    .parallel_ops = true,
    .ops = dp_flow_genl_ops,
    .n_ops = ARRAY_SIZE(dp_flow_genl_ops),
    .mcgrps = &ovs_dp_flow_multicast_group,
    .n_mcgrps = 1,
};

  ovsd对于netlink的实现,主要在lib/netlink-socket.c文件中。而对这些netlink操作的调用,主要在lib/dpif-netlink.c文件中对于各个行为的处理,各种可能的消息类型在datapath模块中事先进行了内核注册。
  datapath中对netlink family类型进行了注册,ovsd在使用这些netlink family之前需要获取它们的信息,这一过程主要在lib/dpif-netlink.c文件的dpif_netlink_init()函数中完成。代码为:

static int  
dpif_netlink_init(void)  
{
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
    static int error;

    if (ovsthread_once_start(&once)) {
        error = nl_lookup_genl_family(OVS_DATAPATH_FAMILY,
                                      &ovs_datapath_family);
        if (error) {
            VLOG_ERR("Generic Netlink family '%s' does not exist. "
                     "The Open vSwitch kernel module is probably not loaded.",
                     OVS_DATAPATH_FAMILY);
        }
        if (!error) {
            error = nl_lookup_genl_family(OVS_VPORT_FAMILY, &ovs_vport_family);
        }
        if (!error) {
            error = nl_lookup_genl_family(OVS_FLOW_FAMILY, &ovs_flow_family);
        }
        if (!error) {
            error = nl_lookup_genl_family(OVS_PACKET_FAMILY,
                                          &ovs_packet_family);
        }
        if (!error) {
            error = nl_lookup_genl_mcgroup(OVS_VPORT_FAMILY, OVS_VPORT_MCGROUP,
                                           &ovs_vport_mcgroup);
        }

        ovsthread_once_done(&once);
    }

    return error;
}

  其中nl_lookup_genl_family代码位于lib/netlink-socket.c中,其作用为查找给定name的netlink family类型是否完成注册,并返回对应的*number值,该值可以直接使用。

/* Resolves the Generic Netlink family 'name' to its family number, caching
 * the outcome in '*number'.
 *
 * '*number' == 0 means "not looked up yet": the lookup is performed and
 * '*number' is set to the family number on success or to the negated errno
 * value on failure.  A nonzero '*number' is treated as a cached result and no
 * lookup is done.
 *
 * Returns 0 if '*number' holds a valid family number, otherwise the positive
 * errno value of the (possibly cached) failure. */
int
nl_lookup_genl_family(const char *name, int *number)
{
    struct nlattr *attrs[ARRAY_SIZE(family_policy)];
    struct ofpbuf *reply;
    int error;

    if (*number != 0) {
        /* Cached: positive is a family number, negative a stored errno. */
        return *number > 0 ? 0 : -*number;
    }

    error = do_lookup_genl_family(name, attrs, &reply);
    if (error) {
        *number = -error;
    } else {
        *number = nl_attr_get_u16(attrs[CTRL_ATTR_FAMILY_ID]);
        define_genl_family(*number, name);
    }
    ofpbuf_delete(reply);

    ovs_assert(*number != 0);
    return *number > 0 ? 0 : -*number;
}

  完成这些查找后,ovsd即可利用dpif中的api,通过发出这些netlink消息给datapath实现对datapath的操作。
  相关的中间层API定义主要在dpif_class(位于 lib/dpif-provider.h)的抽象类型中。下面是关于dpif_class结构体的注释:

/* Datapath interface class structure, to be defined by each implementation of
 * a datapath interface.
 *
 * These functions return 0 if successful or a positive errno value on failure,
 * except where otherwise noted.
 *
 * These functions are expected to execute synchronously, that is, to block as
 * necessary to obtain a result.  Thus, they may not return EAGAIN or
 * EWOULDBLOCK or EINPROGRESS.  We may relax this requirement in the future if
 * and when we encounter performance problems. */

  一共有两种dpif_class实例化类型,分别为dpif_netlink_class和dpif_netdev_class。dpif_netlink_class表示的是通过netlink和本地的datapath通信,而dpif_netdev_class通过网络协议和远程的datapath通信。
  下图是ovsd使用netlink进行消息发送的过程:
flow4

https://www.kernel.org/doc/Documentation/networking/openvswitch.txt
https://github.com/openvswitch/ovs
http://www.ibm.com/developerworks/cn/cloud/library/1401_zhaoyi_openswitch/
http://blog.csdn.net/vonzhoufz/article/details/19840683
http://lxin.org/linux%20kernel/2014/04/06/ovs-datapah-implementation/
https://www.youtube.com/watch?v=TD5wmoD7XOE
http://www.sdnap.com/wp-content/uploads/2013/12/ovs_code.pdf


About the author

vinllen chen

Beijing, China

格物致知


Discussions

comments powered by Disqus