Backported memfd_create() system call

This commit is contained in:
Nathan 2025-04-07 00:23:38 -05:00
parent 3d1c55ae1e
commit eee847d17d
12 changed files with 258 additions and 1 deletions

View File

@ -411,6 +411,7 @@
#define __NR_renameat2 (__NR_SYSCALL_BASE+382)
#define __NR_seccomp (__NR_SYSCALL_BASE+383)
#define __NR_getrandom (__NR_SYSCALL_BASE+384)
#define __NR_memfd_create (__NR_SYSCALL_BASE+385)
/*
* This may need to be greater than __NR_last_syscall+1 in order to

View File

@ -394,6 +394,7 @@
CALL(sys_ni_syscall) /* CALL(sys_renameat2) */
CALL(sys_seccomp)
CALL(sys_getrandom)
/* 385 */ CALL(sys_memfd_create)
#ifndef syscalls_counted
.equ syscalls_padding, ((NR_syscalls + 3) & ~3) - NR_syscalls
#define syscalls_counted

View File

@ -362,3 +362,4 @@
353 i386 renameat2 sys_ni_syscall
354 i386 seccomp sys_seccomp
355 i386 getrandom sys_getrandom
356 i386 memfd_create sys_memfd_create

View File

@ -325,6 +325,7 @@
316 common renameat2 sys_ni_syscall
317 common seccomp sys_seccomp
318 common getrandom sys_getrandom
319 common memfd_create sys_memfd_create
#
# x32-specific system call numbers start at 512 to avoid cache impact

View File

@ -21,6 +21,7 @@
#include <linux/rcupdate.h>
#include <linux/pid_namespace.h>
#include <linux/user_namespace.h>
#include <linux/shmem_fs.h>
#include <asm/poll.h>
#include <asm/siginfo.h>
@ -327,6 +328,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
case F_GETPIPE_SZ:
err = pipe_fcntl(filp, cmd, arg);
break;
case F_ADD_SEALS:
case F_GET_SEALS:
err = shmem_fcntl(filp, cmd, arg);
break;
default:
break;
}

View File

@ -1,6 +1,7 @@
#ifndef __SHMEM_FS_H
#define __SHMEM_FS_H
#include <linux/file.h>
#include <linux/swap.h>
#include <linux/mempolicy.h>
#include <linux/pagemap.h>
@ -11,6 +12,7 @@
struct shmem_inode_info {
spinlock_t lock;
unsigned int seals; /* shmem seals */
unsigned long flags;
unsigned long alloced; /* data pages alloced to file */
union {
@ -62,4 +64,19 @@ static inline struct page *shmem_read_mapping_page(
mapping_gfp_mask(mapping));
}
#ifdef CONFIG_TMPFS
extern int shmem_add_seals(struct file *file, unsigned int seals);
extern int shmem_get_seals(struct file *file);
extern long shmem_fcntl(struct file *file, unsigned int cmd, unsigned long arg);
#else
static inline long shmem_fcntl(struct file *f, unsigned int c, unsigned long a)
{
return -EINVAL;
}
#endif
#endif

View File

@ -782,6 +782,7 @@ asmlinkage long sys_timerfd_settime(int ufd, int flags,
asmlinkage long sys_timerfd_gettime(int ufd, struct itimerspec __user *otmr);
asmlinkage long sys_eventfd(unsigned int count);
asmlinkage long sys_eventfd2(unsigned int count, int flags);
asmlinkage long sys_memfd_create(const char __user *uname_ptr, unsigned int flags);
asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len);
asmlinkage long sys_old_readdir(unsigned int, struct old_linux_dirent __user *, unsigned int);
asmlinkage long sys_pselect6(int, fd_set __user *, fd_set __user *,

View File

@ -704,9 +704,11 @@ __SYSCALL(__NR_renameat2, sys_renameat2)
__SYSCALL(__NR_seccomp, sys_seccomp)
#define __NR_getrandom 278
__SYSCALL(__NR_getrandom, sys_getrandom)
#define __NR_memfd_create 279
__SYSCALL(__NR_memfd_create, sys_memfd_create)
#undef __NR_syscalls
#define __NR_syscalls 279
#define __NR_syscalls 280
/*
* All syscalls below here should go away really,

View File

@ -27,6 +27,21 @@
#define F_SETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 7)
#define F_GETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 8)
/*
* Set/Get seals
*/
#define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9)
#define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10)
/*
* Types of seals
*/
#define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */
#define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */
#define F_SEAL_GROW 0x0004 /* prevent file from growing */
#define F_SEAL_WRITE 0x0008 /* prevent writes */
/* (1U << 31) is reserved for signed error codes */
/*
* Types of directory notifications that may be requested.
*/

View File

@ -0,0 +1,8 @@
#ifndef _UAPI_LINUX_MEMFD_H
#define _UAPI_LINUX_MEMFD_H
/* flags for memfd_create(2) (unsigned int) */
#define MFD_CLOEXEC 0x0001U
#define MFD_ALLOW_SEALING 0x0002U
#endif /* _UAPI_LINUX_MEMFD_H */

View File

@ -193,6 +193,7 @@ cond_syscall(compat_sys_timerfd_settime);
cond_syscall(compat_sys_timerfd_gettime);
cond_syscall(sys_eventfd);
cond_syscall(sys_eventfd2);
cond_syscall(sys_memfd_create);
/* performance counters: */
cond_syscall(sys_perf_event_open);

View File

@ -66,6 +66,9 @@ static struct vfsmount *shm_mnt;
#include <linux/highmem.h>
#include <linux/seq_file.h>
#include <linux/magic.h>
#include <linux/syscalls.h>
#include <linux/fcntl.h>
#include <uapi/linux/memfd.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
@ -603,6 +606,7 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range);
static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = dentry->d_inode;
struct shmem_inode_info *info = SHMEM_I(inode);
int error;
error = inode_change_ok(inode, attr);
@ -613,6 +617,11 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
loff_t oldsize = inode->i_size;
loff_t newsize = attr->ia_size;
/* protected by i_mutex */
if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) ||
(newsize > oldsize && (info->seals & F_SEAL_GROW)))
return -EPERM;
if (newsize != oldsize) {
i_size_write(inode, newsize);
inode->i_ctime = inode->i_mtime = CURRENT_TIME;
@ -1448,6 +1457,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
info = SHMEM_I(inode);
memset(info, 0, (char *)inode - (char *)info);
spin_lock_init(&info->lock);
info->seals = F_SEAL_SEAL;
info->flags = flags & VM_NORESERVE;
INIT_LIST_HEAD(&info->swaplist);
simple_xattrs_init(&info->xattrs);
@ -1501,7 +1511,17 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
struct page **pagep, void **fsdata)
{
struct inode *inode = mapping->host;
struct shmem_inode_info *info = SHMEM_I(inode);
pgoff_t index = pos >> PAGE_CACHE_SHIFT;
/* i_mutex is held by caller */
if (unlikely(info->seals)) {
if (info->seals & F_SEAL_WRITE)
return -EPERM;
if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size)
return -EPERM;
}
return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL);
}
@ -1872,11 +1892,113 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
return offset;
}
static int shmem_wait_for_pins(struct address_space *mapping)
{
return 0;
}
#define F_ALL_SEALS (F_SEAL_SEAL | \
F_SEAL_SHRINK | \
F_SEAL_GROW | \
F_SEAL_WRITE)
int shmem_add_seals(struct file *file, unsigned int seals)
{
struct inode *inode = file_inode(file);
struct shmem_inode_info *info = SHMEM_I(inode);
int error;
/*
* SEALING
* Sealing allows multiple parties to share a shmem-file but restrict
* access to a specific subset of file operations. Seals can only be
* added, but never removed. This way, mutually untrusted parties can
* share common memory regions with a well-defined policy. A malicious
* peer can thus never perform unwanted operations on a shared object.
*
* Seals are only supported on special shmem-files and always affect
* the whole underlying inode. Once a seal is set, it may prevent some
* kinds of access to the file. Currently, the following seals are
* defined:
* SEAL_SEAL: Prevent further seals from being set on this file
* SEAL_SHRINK: Prevent the file from shrinking
* SEAL_GROW: Prevent the file from growing
* SEAL_WRITE: Prevent write access to the file
*
* As we don't require any trust relationship between two parties, we
* must prevent seals from being removed. Therefore, sealing a file
* only adds a given set of seals to the file, it never touches
* existing seals. Furthermore, the "setting seals"-operation can be
* sealed itself, which basically prevents any further seal from being
* added.
*
* Semantics of sealing are only defined on volatile files. Only
* anonymous shmem files support sealing. More importantly, seals are
* never written to disk. Therefore, there's no plan to support it on
* other file types.
*/
if (file->f_op != &shmem_file_operations)
return -EINVAL;
if (!(file->f_mode & FMODE_WRITE))
return -EPERM;
if (seals & ~(unsigned int)F_ALL_SEALS)
return -EINVAL;
mutex_lock(&inode->i_mutex);
if (info->seals & F_SEAL_SEAL) {
error = -EPERM;
goto unlock;
}
info->seals |= seals;
error = 0;
unlock:
mutex_unlock(&inode->i_mutex);
return error;
}
EXPORT_SYMBOL_GPL(shmem_add_seals);
int shmem_get_seals(struct file *file)
{
if (file->f_op != &shmem_file_operations)
return -EINVAL;
return SHMEM_I(file_inode(file))->seals;
}
EXPORT_SYMBOL_GPL(shmem_get_seals);
long shmem_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
{
long error;
switch (cmd) {
case F_ADD_SEALS:
/* disallow upper 32bit */
if (arg > UINT_MAX)
return -EINVAL;
error = shmem_add_seals(file, arg);
break;
case F_GET_SEALS:
error = shmem_get_seals(file);
break;
default:
error = -EINVAL;
break;
}
return error;
}
static long shmem_fallocate(struct file *file, int mode, loff_t offset,
loff_t len)
{
struct inode *inode = file_inode(file);
struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
struct shmem_inode_info *info = SHMEM_I(inode);
struct shmem_falloc shmem_falloc;
pgoff_t start, index, end;
int error;
@ -1889,6 +2011,12 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1;
DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq);
/* protected by i_mutex */
if (info->seals & F_SEAL_WRITE) {
error = -EPERM;
goto out;
}
shmem_falloc.waitq = &shmem_falloc_waitq;
shmem_falloc.start = unmap_start >> PAGE_SHIFT;
shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT;
@ -1915,6 +2043,11 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
if (error)
goto out;
if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) {
error = -EPERM;
goto out;
}
start = offset >> PAGE_CACHE_SHIFT;
end = (offset + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
/* Try to avoid a swapstorm if len is impossible to satisfy */
@ -2631,6 +2764,77 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root)
shmem_show_mpol(seq, sbinfo->mpol);
return 0;
}
#define MFD_NAME_PREFIX "memfd:"
#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1)
#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)
#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING)
SYSCALL_DEFINE2(memfd_create,
const char __user *, uname,
unsigned int, flags)
{
struct shmem_inode_info *info;
struct file *file;
int fd, error;
char *name;
long len;
if (flags & ~(unsigned int)MFD_ALL_FLAGS)
return -EINVAL;
/* length includes terminating zero */
len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1);
if (len <= 0)
return -EFAULT;
if (len > MFD_NAME_MAX_LEN + 1)
return -EINVAL;
name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_TEMPORARY);
if (!name)
return -ENOMEM;
strcpy(name, MFD_NAME_PREFIX);
if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) {
error = -EFAULT;
goto err_name;
}
/* terminating-zero may have changed after strnlen_user() returned */
if (name[len + MFD_NAME_PREFIX_LEN - 1]) {
error = -EFAULT;
goto err_name;
}
fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0);
if (fd < 0) {
error = fd;
goto err_name;
}
file = shmem_file_setup(name, 0, VM_NORESERVE);
if (IS_ERR(file)) {
error = PTR_ERR(file);
goto err_fd;
}
info = SHMEM_I(file_inode(file));
file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
file->f_flags |= O_RDWR | O_LARGEFILE;
if (flags & MFD_ALLOW_SEALING)
info->seals &= ~F_SEAL_SEAL;
fd_install(fd, file);
kfree(name);
return fd;
err_fd:
put_unused_fd(fd);
err_name:
kfree(name);
return error;
}
#endif /* CONFIG_TMPFS */
static void shmem_put_super(struct super_block *sb)