Signed-off-by: Andrea Arcangeli <andrea@cpushare.com>

Index: linux-2.5/arch/i386/Kconfig
===================================================================
RCS file: /home/andrea/crypto/cvs/linux-2.5/arch/i386/Kconfig,v
retrieving revision 1.131
diff -u -p -r1.131 Kconfig
--- linux-2.5/arch/i386/Kconfig	13 Sep 2004 18:32:00 -0000	1.131
+++ linux-2.5/arch/i386/Kconfig	12 Oct 2004 01:04:09 -0000
@@ -33,6 +33,13 @@ config GENERIC_IOMAP
 	bool
 	default y
 
+# Secure computing mode.  The symbol has no prompt, so it is not
+# user-selectable and always takes its default of y.
+# fs/proc/base.c keys the per-task "seccomp" proc file off it.
+config SECCOMP
+	bool
+	default y
+
 source "init/Kconfig"
 
 menu "Processor type and features"
Index: linux-2.5/arch/i386/kernel/entry.S
===================================================================
RCS file: /home/andrea/crypto/cvs/linux-2.5/arch/i386/kernel/entry.S,v
retrieving revision 1.90
diff -u -p -r1.90 entry.S
--- linux-2.5/arch/i386/kernel/entry.S	8 Sep 2004 14:49:36 -0000	1.90
+++ linux-2.5/arch/i386/kernel/entry.S	12 Oct 2004 00:52:07 -0000
@@ -157,12 +157,19 @@ do_lcall:
 	movl %edx,EIP(%ebp)	# Now we move them to their "normal" places
 	movl %ecx,CS(%ebp)	#
 	GET_THREAD_INFO_WITH_ESP(%ebp)	# GET_THREAD_INFO
+	/* call gates cannot run with SECCOMP enabled */
+	testw $_TIF_SECCOMP,TI_flags(%ebp)
+	jnz sigkill
 	movl TI_exec_domain(%ebp), %edx	# Get the execution domain
 	call *EXEC_DOMAIN_handler(%edx)	# Call the handler for the domain
 	addl $4, %esp
 	popl %eax
 	jmp resume_userspace
 
+sigkill:			# reached when a TIF_SECCOMP task enters via a call gate
+	pushl $9		# stack argument for do_exit(); 9 is SIGKILL on i386
+	call do_exit		# must not come back: execution would fall into lcall27
+
 ENTRY(lcall27)
 	pushfl			# We get a different stack layout with call
 				# gates, which has to be cleaned up later..
@@ -256,7 +263,7 @@ sysenter_past_esp:
 	SAVE_ALL
 	GET_THREAD_INFO(%ebp)
 
-	testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
+	testw $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),TI_flags(%ebp)
 	jnz syscall_trace_entry
 	cmpl $(nr_syscalls), %eax
 	jae syscall_badsys
@@ -279,7 +286,7 @@ ENTRY(system_call)
 	SAVE_ALL
 	GET_THREAD_INFO(%ebp)
 					# system call tracing in operation
-	testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
+	testw $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),TI_flags(%ebp)
 	jnz syscall_trace_entry
 	cmpl $(nr_syscalls), %eax
 	jae syscall_badsys
Index: linux-2.5/arch/i386/kernel/ptrace.c
===================================================================
RCS file: /home/andrea/crypto/cvs/linux-2.5/arch/i386/kernel/ptrace.c,v
retrieving revision 1.26
diff -u -p -r1.26 ptrace.c
--- linux-2.5/arch/i386/kernel/ptrace.c	23 Aug 2004 19:40:02 -0000	1.26
+++ linux-2.5/arch/i386/kernel/ptrace.c	12 Oct 2004 00:52:07 -0000
@@ -15,6 +15,7 @@
 #include <linux/user.h>
 #include <linux/security.h>
 #include <linux/audit.h>
+#include <linux/seccomp.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
@@ -530,6 +531,10 @@ out:
 __attribute__((regparm(3)))
 void do_syscall_trace(struct pt_regs *regs, int entryexit)
 {
+	/* do the secure computing check first */
+	if (unlikely(test_thread_flag(TIF_SECCOMP)))
+		secure_computing(regs->orig_eax);
+
 	if (unlikely(current->audit_context)) {
 		if (!entryexit)
 			audit_syscall_entry(current, regs->orig_eax,
Index: linux-2.5/arch/x86_64/Kconfig
===================================================================
RCS file: /home/andrea/crypto/cvs/linux-2.5/arch/x86_64/Kconfig,v
retrieving revision 1.56
diff -u -p -r1.56 Kconfig
--- linux-2.5/arch/x86_64/Kconfig	6 Oct 2004 15:14:04 -0000	1.56
+++ linux-2.5/arch/x86_64/Kconfig	12 Oct 2004 01:01:18 -0000
@@ -82,6 +82,13 @@ config GENERIC_IOMAP
 	bool
 	default y
 
+# Secure computing mode.  The symbol has no prompt, so it is not
+# user-selectable and always takes its default of y.
+# fs/proc/base.c keys the per-task "seccomp" proc file off it.
+config SECCOMP
+	bool
+	default y
+
 source "init/Kconfig"
 
 
Index: linux-2.5/arch/x86_64/ia32/ia32entry.S
===================================================================
RCS file: /home/andrea/crypto/cvs/linux-2.5/arch/x86_64/ia32/ia32entry.S,v
retrieving revision 1.39
diff -u -p -r1.39 ia32entry.S
--- linux-2.5/arch/x86_64/ia32/ia32entry.S	31 Aug 2004 17:35:25 -0000	1.39
+++ linux-2.5/arch/x86_64/ia32/ia32entry.S	12 Oct 2004 00:52:07 -0000
@@ -78,7 +78,7 @@ ENTRY(ia32_sysenter_target)
  	.quad 1b,ia32_badarg
  	.previous	
 	GET_THREAD_INFO(%r10)
-	testl  $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%r10)
+	testl  $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
 	jnz  sysenter_tracesys
 sysenter_do_call:	
 	cmpl	$(IA32_NR_syscalls),%eax
@@ -163,7 +163,7 @@ ENTRY(ia32_cstar_target)
 	.quad 1b,ia32_badarg
 	.previous	
 	GET_THREAD_INFO(%r10)
-	testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%r10)
+	testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
 	jnz   cstar_tracesys
 cstar_do_call:	
 	cmpl $IA32_NR_syscalls,%eax
@@ -236,7 +236,7 @@ ENTRY(ia32_syscall)
 	   this could be a problem. */
 	SAVE_ARGS 0,0,1
 	GET_THREAD_INFO(%r10)
-	testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%r10)
+	testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
 	jnz ia32_tracesys
 ia32_do_syscall:	
 	cmpl $(IA32_NR_syscalls),%eax
Index: linux-2.5/arch/x86_64/kernel/entry.S
===================================================================
RCS file: /home/andrea/crypto/cvs/linux-2.5/arch/x86_64/kernel/entry.S,v
retrieving revision 1.24
diff -u -p -r1.24 entry.S
--- linux-2.5/arch/x86_64/kernel/entry.S	24 Aug 2004 18:20:09 -0000	1.24
+++ linux-2.5/arch/x86_64/kernel/entry.S	12 Oct 2004 00:52:07 -0000
@@ -185,7 +185,7 @@ ENTRY(system_call)
 	movq  %rax,ORIG_RAX-ARGOFFSET(%rsp) 
 	movq  %rcx,RIP-ARGOFFSET(%rsp)  
 	GET_THREAD_INFO(%rcx)
-	testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx)
+	testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx)
 	jnz tracesys
 	cmpq $__NR_syscall_max,%rax
 	ja badsys
Index: linux-2.5/arch/x86_64/kernel/ptrace.c
===================================================================
RCS file: /home/andrea/crypto/cvs/linux-2.5/arch/x86_64/kernel/ptrace.c,v
retrieving revision 1.18
diff -u -p -r1.18 ptrace.c
--- linux-2.5/arch/x86_64/kernel/ptrace.c	24 Aug 2004 18:20:09 -0000	1.18
+++ linux-2.5/arch/x86_64/kernel/ptrace.c	12 Oct 2004 00:52:07 -0000
@@ -17,6 +17,7 @@
 #include <linux/user.h>
 #include <linux/security.h>
 #include <linux/audit.h>
+#include <linux/seccomp.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
@@ -519,6 +520,10 @@ static void syscall_trace(struct pt_regs
 
 asmlinkage void syscall_trace_enter(struct pt_regs *regs)
 {
+	/* do the secure computing check first */
+	if (unlikely(test_thread_flag(TIF_SECCOMP)))
+		secure_computing(regs->orig_rax);
+
 	if (unlikely(current->audit_context))
 		audit_syscall_entry(current, regs->orig_rax,
 				    regs->rdi, regs->rsi,
Index: linux-2.5/fs/proc/base.c
===================================================================
RCS file: /home/andrea/crypto/cvs/linux-2.5/fs/proc/base.c,v
retrieving revision 1.82
diff -u -p -r1.82 base.c
--- linux-2.5/fs/proc/base.c	8 Sep 2004 14:50:33 -0000	1.82
+++ linux-2.5/fs/proc/base.c	12 Oct 2004 01:02:59 -0000
@@ -32,6 +32,9 @@
 #include <linux/mount.h>
 #include <linux/security.h>
 #include <linux/ptrace.h>
+#ifdef CONFIG_SECCOMP
+#include <linux/seccomp.h>
+#endif
 
 /*
  * For hysterical raisins we keep the same inumbers as in the old procfs.
@@ -48,6 +51,9 @@ enum pid_directory_inos {
 	PROC_TGID_TASK,
 	PROC_TGID_STATUS,
 	PROC_TGID_MEM,
+#ifdef CONFIG_SECCOMP
+	PROC_TGID_SECCOMP,
+#endif
 	PROC_TGID_CWD,
 	PROC_TGID_ROOT,
 	PROC_TGID_EXE,
@@ -74,6 +80,9 @@ enum pid_directory_inos {
 	PROC_TID_INO,
 	PROC_TID_STATUS,
 	PROC_TID_MEM,
+#ifdef CONFIG_SECCOMP
+	PROC_TID_SECCOMP,
+#endif
 	PROC_TID_CWD,
 	PROC_TID_ROOT,
 	PROC_TID_EXE,
@@ -119,6 +128,9 @@ static struct pid_entry tgid_base_stuff[
 	E(PROC_TGID_STATM,     "statm",   S_IFREG|S_IRUGO),
 	E(PROC_TGID_MAPS,      "maps",    S_IFREG|S_IRUGO),
 	E(PROC_TGID_MEM,       "mem",     S_IFREG|S_IRUSR|S_IWUSR),
+#ifdef CONFIG_SECCOMP
+	E(PROC_TGID_SECCOMP,   "seccomp", S_IFREG|S_IRUSR|S_IWUSR),
+#endif
 	E(PROC_TGID_CWD,       "cwd",     S_IFLNK|S_IRWXUGO),
 	E(PROC_TGID_ROOT,      "root",    S_IFLNK|S_IRWXUGO),
 	E(PROC_TGID_EXE,       "exe",     S_IFLNK|S_IRWXUGO),
@@ -144,6 +156,9 @@ static struct pid_entry tid_base_stuff[]
 	E(PROC_TID_STATM,      "statm",   S_IFREG|S_IRUGO),
 	E(PROC_TID_MAPS,       "maps",    S_IFREG|S_IRUGO),
 	E(PROC_TID_MEM,        "mem",     S_IFREG|S_IRUSR|S_IWUSR),
+#ifdef CONFIG_SECCOMP
+	E(PROC_TID_SECCOMP,    "seccomp", S_IFREG|S_IRUSR|S_IWUSR),
+#endif
 	E(PROC_TID_CWD,        "cwd",     S_IFLNK|S_IRWXUGO),
 	E(PROC_TID_ROOT,       "root",    S_IFLNK|S_IRWXUGO),
 	E(PROC_TID_EXE,        "exe",     S_IFLNK|S_IRWXUGO),
@@ -701,6 +716,81 @@ static struct inode_operations proc_mem_
 	.permission	= proc_permission,
 };
 
+#ifdef CONFIG_SECCOMP
+/*
+ * /proc/<pid>/seccomp read handler: reports the task's seccomp_mode as
+ * an unsigned decimal number followed by a newline.
+ *
+ * Returns the number of bytes copied, 0 once *ppos is at or past the end
+ * of that text, or -EFAULT if copy_to_user() fails.
+ */
+static ssize_t seccomp_read(struct file * file, char __user * buf,
+			    size_t count, loff_t *ppos)
+{
+	struct task_struct * tsk = proc_task(file->f_dentry->d_inode);
+	char __buf[20];
+	loff_t __ppos = *ppos;
+	size_t len;
+
+	/* no need to hand the trailing NUL to userspace, so use only len */
+	len = sprintf(__buf, "%u\n", tsk->seccomp_mode);
+	if (__ppos >= len)
+		return 0;
+	if (count > len-__ppos)
+		count = len-__ppos;
+	if (copy_to_user(buf, __buf + __ppos, count))
+		return -EFAULT;
+	*ppos += count;
+	return count;
+}
+
+/*
+ * /proc/<pid>/seccomp write handler: parses one number (base chosen by
+ * simple_strtoul(), an optional trailing newline is consumed) and, when
+ * it is a valid mode in 1..NR_SECCOMP_MODES, records it in the task and
+ * sets TIF_SECCOMP.
+ *
+ * Returns the number of bytes consumed, -EPERM if a mode is already set,
+ * -EFAULT if copy_from_user() fails, -EIO if no number could be parsed
+ * and -EINVAL if the number is not a valid mode.
+ */
+static ssize_t seccomp_write(struct file * file, const char __user * buf,
+			     size_t count, loff_t *ppos)
+{
+	struct task_struct * tsk = proc_task(file->f_dentry->d_inode);
+	char __buf[20], * end;
+	unsigned int seccomp_mode;
+
+	/* can set it only once to be even more secure */
+	if (unlikely(tsk->seccomp_mode))
+		return -EPERM;
+
+	/* zero-fill and keep the last byte free so __buf stays terminated */
+	memset(__buf, 0, sizeof(__buf));
+	if (count > sizeof(__buf) - 1)
+		count = sizeof(__buf) - 1;
+	if (copy_from_user(__buf, buf, count))
+		return -EFAULT;
+	seccomp_mode = simple_strtoul(__buf, &end, 0);
+	if (unlikely(end == __buf))
+		return -EIO;
+	if (*end == '\n')
+		end++;
+	/* fail loudly: the writer must never believe it is confined when it is not */
+	if (!seccomp_mode || seccomp_mode > NR_SECCOMP_MODES)
+		return -EINVAL;
+	tsk->seccomp_mode = seccomp_mode;
+	set_tsk_thread_flag(tsk, TIF_SECCOMP);
+	return end - __buf;
+}
+
+/* File operations behind the per-process and per-thread "seccomp" proc entries. */
+static struct file_operations proc_seccomp_operations = {
+	.read		= seccomp_read,
+	.write		= seccomp_write,
+};
+#endif /* CONFIG_SECCOMP */
+
 static int proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
 	struct inode *inode = dentry->d_inode;
@@ -1338,6 +1407,12 @@ static struct dentry *proc_pident_lookup
 			inode->i_op = &proc_mem_inode_operations;
 			inode->i_fop = &proc_mem_operations;
 			break;
+#ifdef CONFIG_SECCOMP
+		case PROC_TID_SECCOMP:
+		case PROC_TGID_SECCOMP:
+			inode->i_fop = &proc_seccomp_operations;
+			break;
+#endif /* CONFIG_SECCOMP */
 		case PROC_TID_MOUNTS:
 		case PROC_TGID_MOUNTS:
 			inode->i_fop = &proc_mounts_operations;
Index: linux-2.5/include/asm-i386/thread_info.h
===================================================================
RCS file: /home/andrea/crypto/cvs/linux-2.5/include/asm-i386/thread_info.h,v
retrieving revision 1.21
diff -u -p -r1.21 thread_info.h
--- linux-2.5/include/asm-i386/thread_info.h	23 Aug 2004 19:36:54 -0000	1.21
+++ linux-2.5/include/asm-i386/thread_info.h	12 Oct 2004 00:52:07 -0000
@@ -144,6 +144,7 @@ static inline unsigned long current_stac
 #define TIF_SINGLESTEP		4	/* restore singlestep on return to user mode */
 #define TIF_IRET		5	/* return with iret */
 #define TIF_SYSCALL_AUDIT	7	/* syscall auditing active */
+#define TIF_SECCOMP		8	/* secure computing (bit 8: not visible to a byte-wide testb) */
 #define TIF_POLLING_NRFLAG	16	/* true if poll_idle() is polling TIF_NEED_RESCHED */
 
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
@@ -153,12 +154,14 @@ static inline unsigned long current_stac
 #define _TIF_SINGLESTEP		(1<<TIF_SINGLESTEP)
 #define _TIF_IRET		(1<<TIF_IRET)
 #define _TIF_SYSCALL_AUDIT	(1<<TIF_SYSCALL_AUDIT)
+#define _TIF_SECCOMP		(1<<TIF_SECCOMP)
 #define _TIF_POLLING_NRFLAG	(1<<TIF_POLLING_NRFLAG)
 
 /* work to do on interrupt/exception return */
 #define _TIF_WORK_MASK \
-  (0x0000FFFF & ~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP))
-#define _TIF_ALLWORK_MASK	0x0000FFFF	/* work to do on any return to u-space */
+  (0x0000FFFF & ~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP|_TIF_SECCOMP))
+/* work to do on any return to u-space */
+#define _TIF_ALLWORK_MASK	(0x0000FFFF & ~_TIF_SECCOMP)
 
 /*
  * Thread-synchronous status.
Index: linux-2.5/include/asm-x86_64/thread_info.h
===================================================================
RCS file: /home/andrea/crypto/cvs/linux-2.5/include/asm-x86_64/thread_info.h,v
retrieving revision 1.17
diff -u -p -r1.17 thread_info.h
--- linux-2.5/include/asm-x86_64/thread_info.h	12 Apr 2004 20:29:12 -0000	1.17
+++ linux-2.5/include/asm-x86_64/thread_info.h	12 Oct 2004 00:52:07 -0000
@@ -102,6 +102,7 @@ static inline struct thread_info *stack_
 #define TIF_SINGLESTEP		4	/* reenable singlestep on user return*/
 #define TIF_IRET		5	/* force IRET */
 #define TIF_SYSCALL_AUDIT	7	/* syscall auditing active */
+#define TIF_SECCOMP		8	/* secure computing */
 #define TIF_POLLING_NRFLAG	16	/* true if poll_idle() is polling TIF_NEED_RESCHED */
 #define TIF_IA32		17	/* 32bit process */ 
 #define TIF_FORK		18	/* ret_from_fork */
@@ -114,6 +115,7 @@ static inline struct thread_info *stack_
 #define _TIF_NEED_RESCHED	(1<<TIF_NEED_RESCHED)
 #define _TIF_IRET		(1<<TIF_IRET)
 #define _TIF_SYSCALL_AUDIT	(1<<TIF_SYSCALL_AUDIT)
+#define _TIF_SECCOMP		(1<<TIF_SECCOMP)
 #define _TIF_POLLING_NRFLAG	(1<<TIF_POLLING_NRFLAG)
 #define _TIF_IA32		(1<<TIF_IA32)
 #define _TIF_FORK		(1<<TIF_FORK)
@@ -121,9 +123,9 @@ static inline struct thread_info *stack_
 
 /* work to do on interrupt/exception return */
 #define _TIF_WORK_MASK \
-  (0x0000FFFF & ~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP))
+  (0x0000FFFF & ~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP|_TIF_SECCOMP))
 /* work to do on any return to user space */
-#define _TIF_ALLWORK_MASK 0x0000FFFF	
+#define _TIF_ALLWORK_MASK (0x0000FFFF & ~_TIF_SECCOMP)
 
 #define PREEMPT_ACTIVE     0x4000000
 
Index: linux-2.5/include/linux/sched.h
===================================================================
RCS file: /home/andrea/crypto/cvs/linux-2.5/include/linux/sched.h,v
retrieving revision 1.273
diff -u -p -r1.273 sched.h
--- linux-2.5/include/linux/sched.h	5 Oct 2004 23:44:44 -0000	1.273
+++ linux-2.5/include/linux/sched.h	12 Oct 2004 00:52:07 -0000
@@ -549,6 +549,7 @@ struct task_struct {
 	
 	void *security;
 	struct audit_context *audit_context;
+	unsigned int seccomp_mode;
 
 /* Thread group tracking */
    	u32 parent_exec_id;
Index: linux-2.5/kernel/Makefile
===================================================================
RCS file: /home/andrea/crypto/cvs/linux-2.5/kernel/Makefile,v
retrieving revision 1.43
diff -u -p -r1.43 Makefile
--- linux-2.5/kernel/Makefile	4 Sep 2004 23:18:26 -0000	1.43
+++ linux-2.5/kernel/Makefile	12 Oct 2004 00:52:07 -0000
@@ -7,7 +7,8 @@ obj-y     = sched.o fork.o exec_domain.o
 	    sysctl.o capability.o ptrace.o timer.o user.o \
 	    signal.o sys.o kmod.o workqueue.o pid.o \
 	    rcupdate.o intermodule.o extable.o params.o posix-timers.o \
 	    kthread.o
 
 obj-$(CONFIG_FUTEX) += futex.o
+obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
--- /dev/null	2004-04-06 15:27:52.000000000 +0200
+++ linux-2.5/kernel/seccomp.c	2004-10-12 02:52:07.106845552 +0200
@@ -0,0 +1,85 @@
+/*
+ * linux/kernel/seccomp.c
+ *
+ * Copyright 2004  Andrea Arcangeli <andrea@cpushare.com>
+ *
+ * This defines a simple but solid secure-computing mode.
+ */
+
+#include <linux/seccomp.h>
+#include <linux/sched.h>
+#include <asm/unistd.h>
+#ifdef TIF_IA32
+#include <asm/ia32_unistd.h>
+#endif
+
+/* #define SECCOMP_DEBUG 1 */
+
+/*
+ * End-of-table marker for the syscall whitelists.  It must be negative:
+ * 0 is a valid syscall number, so it cannot double as the terminator.
+ */
+#define SECCOMP_LIST_END	(-1)
+
+/*
+ * Secure computing mode 1 allows only read/write/exit/sigreturn.
+ * To be fully secure this must be combined with rlimit
+ * to limit the stack allocations too.
+ */
+static const int mode1_syscalls[] = {
+	__NR_read, __NR_write, __NR_exit,
+	/*
+	 * Allow either sigreturn or rt_sigreturn, newer archs
+	 * like x86-64 only defines __NR_rt_sigreturn.
+	 */
+#ifdef __NR_sigreturn
+	__NR_sigreturn,
+#else
+	__NR_rt_sigreturn,
+#endif
+	SECCOMP_LIST_END,
+};
+
+#ifdef TIF_IA32
+/* Mode 1 whitelist used for tasks with TIF_IA32 set, in __NR_ia32_* numbers. */
+static const int mode1_syscalls_32bit[] = {
+	__NR_ia32_read, __NR_ia32_write, __NR_ia32_exit,
+	__NR_ia32_sigreturn,
+	SECCOMP_LIST_END,
+};
+#endif
+
+/*
+ * secure_computing - enforce the current task's seccomp mode
+ * @this_syscall: number of the syscall the task is about to run
+ *
+ * Returns normally if the syscall is on the whitelist of the task's
+ * mode; otherwise ends the task through do_exit(SIGKILL).  A mode with
+ * no whitelist defined here hits BUG().
+ */
+void secure_computing(int this_syscall)
+{
+	int mode = current->seccomp_mode;
+	const int * syscall;
+
+	switch (mode) {
+	case 1:
+		syscall = mode1_syscalls;
+#ifdef TIF_IA32
+		if (test_thread_flag(TIF_IA32))
+			syscall = mode1_syscalls_32bit;
+#endif
+		for (; *syscall != SECCOMP_LIST_END; syscall++) {
+			if (*syscall == this_syscall)
+				return;
+		}
+		break;
+	default:
+		BUG();
+	}
+
+#ifdef SECCOMP_DEBUG
+	dump_stack();
+#endif
+	do_exit(SIGKILL);
+}
--- /dev/null	2004-04-06 15:27:52.000000000 +0200
+++ linux-2.5/include/linux/seccomp.h	2004-10-12 02:52:07.105845704 +0200
@@ -0,0 +1,8 @@
+#ifndef _LINUX_SECCOMP_H
+#define _LINUX_SECCOMP_H
+
+#define NR_SECCOMP_MODES 1	/* highest mode number the /proc seccomp write accepts */
+
+extern void secure_computing(int);	/* arg: syscall number; see kernel/seccomp.c */
+
+#endif /* _LINUX_SECCOMP_H */