LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
* [take28-resend_2->0 3/8] kevent: poll/select() notifications.
  2006-12-17 13:53     ` [take28-resend_2->0 2/8] kevent: Core files Evgeniy Polyakov
@ 2006-12-17 13:53       ` Evgeniy Polyakov
  2006-12-17 13:53         ` [take28-resend_2->0 4/8] kevent: Socket notifications Evgeniy Polyakov
  0 siblings, 1 reply; 76+ messages in thread
From: Evgeniy Polyakov @ 2006-12-17 13:53 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: David Miller, Ulrich Drepper, Andrew Morton, Evgeniy Polyakov,
	netdev, Zach Brown, Christoph Hellwig, Chase Venters,
	Johann Borck, linux-kernel, Jeff Garzik


poll/select() notifications.

This patch includes generic poll/select notifications.
kevent_poll works simialr to epoll and has the same issues (callback
is invoked not from internal state machine of the caller, but through
process awake, a lot of allocations and so on).

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mitp.ru>

diff --git a/fs/file_table.c b/fs/file_table.c
index bc35a40..0805547 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -20,6 +20,7 @@
 #include <linux/cdev.h>
 #include <linux/fsnotify.h>
 #include <linux/sysctl.h>
+#include <linux/kevent.h>
 #include <linux/percpu_counter.h>
 
 #include <asm/atomic.h>
@@ -119,6 +120,7 @@ struct file *get_empty_filp(void)
 	f->f_uid = tsk->fsuid;
 	f->f_gid = tsk->fsgid;
 	eventpoll_init_file(f);
+	kevent_init_file(f);
 	/* f->f_version: 0 */
 	return f;
 
@@ -164,6 +166,7 @@ void fastcall __fput(struct file *file)
 	 * in the file cleanup chain.
 	 */
 	eventpoll_release(file);
+	kevent_cleanup_file(file);
 	locks_remove_flock(file);
 
 	if (file->f_op && file->f_op->release)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 5baf3a1..8bbf3a5 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -276,6 +276,7 @@ extern int dir_notify_enable;
 #include <linux/init.h>
 #include <linux/sched.h>
 #include <linux/mutex.h>
+#include <linux/kevent_storage.h>
 
 #include <asm/atomic.h>
 #include <asm/semaphore.h>
@@ -586,6 +587,10 @@ struct inode {
 	struct mutex		inotify_mutex;	/* protects the watches list */
 #endif
 
+#if defined CONFIG_KEVENT_SOCKET || defined CONFIG_KEVENT_PIPE
+	struct kevent_storage	st;
+#endif
+
 	unsigned long		i_state;
 	unsigned long		dirtied_when;	/* jiffies of first dirtying */
 
@@ -739,6 +744,9 @@ struct file {
 	struct list_head	f_ep_links;
 	spinlock_t		f_ep_lock;
 #endif /* #ifdef CONFIG_EPOLL */
+#ifdef CONFIG_KEVENT_POLL
+	struct kevent_storage	st;
+#endif
 	struct address_space	*f_mapping;
 };
 extern spinlock_t files_lock;
diff --git a/kernel/kevent/kevent_poll.c b/kernel/kevent/kevent_poll.c
new file mode 100644
index 0000000..7ccf7da
--- /dev/null
+++ b/kernel/kevent/kevent_poll.c
@@ -0,0 +1,234 @@
+/*
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/timer.h>
+#include <linux/file.h>
+#include <linux/kevent.h>
+#include <linux/poll.h>
+#include <linux/fs.h>
+
+static kmem_cache_t *kevent_poll_container_cache;
+static kmem_cache_t *kevent_poll_priv_cache;
+
+struct kevent_poll_ctl
+{
+	struct poll_table_struct 	pt;
+	struct kevent			*k;
+};
+
+struct kevent_poll_wait_container
+{
+	struct list_head		container_entry;
+	wait_queue_head_t		*whead;
+	wait_queue_t			wait;
+	struct kevent			*k;
+};
+
+struct kevent_poll_private
+{
+	struct list_head		container_list;
+	spinlock_t			container_lock;
+};
+
+static int kevent_poll_enqueue(struct kevent *k);
+static int kevent_poll_dequeue(struct kevent *k);
+static int kevent_poll_callback(struct kevent *k);
+
+static int kevent_poll_wait_callback(wait_queue_t *wait,
+		unsigned mode, int sync, void *key)
+{
+	struct kevent_poll_wait_container *cont =
+		container_of(wait, struct kevent_poll_wait_container, wait);
+	struct kevent *k = cont->k;
+
+	kevent_storage_ready(k->st, NULL, KEVENT_MASK_ALL);
+	return 0;
+}
+
+static void kevent_poll_qproc(struct file *file, wait_queue_head_t *whead,
+		struct poll_table_struct *poll_table)
+{
+	struct kevent *k =
+		container_of(poll_table, struct kevent_poll_ctl, pt)->k;
+	struct kevent_poll_private *priv = k->priv;
+	struct kevent_poll_wait_container *cont;
+	unsigned long flags;
+
+	cont = kmem_cache_alloc(kevent_poll_container_cache, GFP_KERNEL);
+	if (!cont) {
+		kevent_break(k);
+		return;
+	}
+
+	cont->k = k;
+	init_waitqueue_func_entry(&cont->wait, kevent_poll_wait_callback);
+	cont->whead = whead;
+
+	spin_lock_irqsave(&priv->container_lock, flags);
+	list_add_tail(&cont->container_entry, &priv->container_list);
+	spin_unlock_irqrestore(&priv->container_lock, flags);
+
+	add_wait_queue(whead, &cont->wait);
+}
+
+static int kevent_poll_enqueue(struct kevent *k)
+{
+	struct file *file;
+	int err;
+	unsigned int revents;
+	unsigned long flags;
+	struct kevent_poll_ctl ctl;
+	struct kevent_poll_private *priv;
+
+	file = fget(k->event.id.raw[0]);
+	if (!file)
+		return -EBADF;
+	
+	err = -EINVAL;
+	if (!file->f_op || !file->f_op->poll)
+		goto err_out_fput;
+	
+	err = -ENOMEM;
+	priv = kmem_cache_alloc(kevent_poll_priv_cache, GFP_KERNEL);
+	if (!priv)
+		goto err_out_fput;
+
+	spin_lock_init(&priv->container_lock);
+	INIT_LIST_HEAD(&priv->container_list);
+
+	k->priv = priv;
+
+	ctl.k = k;
+	init_poll_funcptr(&ctl.pt, &kevent_poll_qproc);
+	
+	err = kevent_storage_enqueue(&file->st, k);
+	if (err)
+		goto err_out_free;
+
+	if (k->event.req_flags & KEVENT_REQ_ALWAYS_QUEUE) {
+		kevent_requeue(k);
+	} else {
+		revents = file->f_op->poll(file, &ctl.pt);
+		if (revents & k->event.event) {
+			err = 1;
+			goto out_dequeue;
+		}
+	}
+
+	spin_lock_irqsave(&k->ulock, flags);
+	k->event.req_flags |= KEVENT_REQ_LAST_CHECK;
+	spin_unlock_irqrestore(&k->ulock, flags);
+
+	return 0;
+
+out_dequeue:
+	kevent_storage_dequeue(k->st, k);
+err_out_free:
+	kmem_cache_free(kevent_poll_priv_cache, priv);
+err_out_fput:
+	fput(file);
+	return err;
+}
+
+static int kevent_poll_dequeue(struct kevent *k)
+{
+	struct file *file = k->st->origin;
+	struct kevent_poll_private *priv = k->priv;
+	struct kevent_poll_wait_container *w, *n;
+	unsigned long flags;
+
+	kevent_storage_dequeue(k->st, k);
+
+	spin_lock_irqsave(&priv->container_lock, flags);
+	list_for_each_entry_safe(w, n, &priv->container_list, container_entry) {
+		list_del(&w->container_entry);
+		remove_wait_queue(w->whead, &w->wait);
+		kmem_cache_free(kevent_poll_container_cache, w);
+	}
+	spin_unlock_irqrestore(&priv->container_lock, flags);
+
+	kmem_cache_free(kevent_poll_priv_cache, priv);
+	k->priv = NULL;
+
+	fput(file);
+
+	return 0;
+}
+
+static int kevent_poll_callback(struct kevent *k)
+{
+	if (k->event.req_flags & KEVENT_REQ_LAST_CHECK) {
+		return 1;
+	} else {
+		struct file *file = k->st->origin;
+		unsigned int revents = file->f_op->poll(file, NULL);
+
+		k->event.ret_data[0] = revents & k->event.event;
+		
+		return (revents & k->event.event);
+	}
+}
+
+static int __init kevent_poll_sys_init(void)
+{
+	struct kevent_callbacks pc = {
+		.callback = &kevent_poll_callback,
+		.enqueue = &kevent_poll_enqueue,
+		.dequeue = &kevent_poll_dequeue,
+		.flags = 0,
+	};
+
+	kevent_poll_container_cache = kmem_cache_create("kevent_poll_container_cache",
+			sizeof(struct kevent_poll_wait_container), 0, 0, NULL, NULL);
+	if (!kevent_poll_container_cache) {
+		printk(KERN_ERR "Failed to create kevent poll container cache.\n");
+		return -ENOMEM;
+	}
+
+	kevent_poll_priv_cache = kmem_cache_create("kevent_poll_priv_cache",
+			sizeof(struct kevent_poll_private), 0, 0, NULL, NULL);
+	if (!kevent_poll_priv_cache) {
+		printk(KERN_ERR "Failed to create kevent poll private data cache.\n");
+		kmem_cache_destroy(kevent_poll_container_cache);
+		kevent_poll_container_cache = NULL;
+		return -ENOMEM;
+	}
+
+	kevent_add_callbacks(&pc, KEVENT_POLL);
+
+	printk(KERN_INFO "Kevent poll()/select() subsystem has been initialized.\n");
+	return 0;
+}
+
+static struct lock_class_key kevent_poll_key;
+
+void kevent_poll_reinit(struct file *file)
+{
+	lockdep_set_class(&file->st.lock, &kevent_poll_key);
+}
+
+static void __exit kevent_poll_sys_fini(void)
+{
+	kmem_cache_destroy(kevent_poll_priv_cache);
+	kmem_cache_destroy(kevent_poll_container_cache);
+}
+
+module_init(kevent_poll_sys_init);
+module_exit(kevent_poll_sys_fini);


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [take28-resend_2->0 4/8] kevent: Socket notifications.
  2006-12-17 13:53       ` [take28-resend_2->0 3/8] kevent: poll/select() notifications Evgeniy Polyakov
@ 2006-12-17 13:53         ` Evgeniy Polyakov
  2006-12-17 13:53           ` [take28-resend_2->0 5/8] kevent: Timer notifications Evgeniy Polyakov
  0 siblings, 1 reply; 76+ messages in thread
From: Evgeniy Polyakov @ 2006-12-17 13:53 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: David Miller, Ulrich Drepper, Andrew Morton, Evgeniy Polyakov,
	netdev, Zach Brown, Christoph Hellwig, Chase Venters,
	Johann Borck, linux-kernel, Jeff Garzik


Socket notifications.

This patch includes socket send/recv/accept notifications.
Using trivial web server based on kevent and this features
instead of epoll it's performance increased more than noticebly.
More details about various benchmarks and server itself 
(evserver_kevent.c) can be found on project's homepage.

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mitp.ru>

diff --git a/fs/inode.c b/fs/inode.c
index ada7643..2740617 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -21,6 +21,7 @@
 #include <linux/cdev.h>
 #include <linux/bootmem.h>
 #include <linux/inotify.h>
+#include <linux/kevent.h>
 #include <linux/mount.h>
 
 /*
@@ -164,12 +165,18 @@ static struct inode *alloc_inode(struct super_block *sb)
 		}
 		inode->i_private = 0;
 		inode->i_mapping = mapping;
+#if defined CONFIG_KEVENT_SOCKET || defined CONFIG_KEVENT_PIPE
+		kevent_storage_init(inode, &inode->st);
+#endif
 	}
 	return inode;
 }
 
 void destroy_inode(struct inode *inode) 
 {
+#if defined CONFIG_KEVENT_SOCKET || defined CONFIG_KEVENT_PIPE
+	kevent_storage_fini(&inode->st);
+#endif
 	BUG_ON(inode_has_buffers(inode));
 	security_inode_free(inode);
 	if (inode->i_sb->s_op->destroy_inode)
diff --git a/include/net/sock.h b/include/net/sock.h
index edd4d73..d48ded8 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -48,6 +48,7 @@
 #include <linux/netdevice.h>
 #include <linux/skbuff.h>	/* struct sk_buff */
 #include <linux/security.h>
+#include <linux/kevent.h>
 
 #include <linux/filter.h>
 
@@ -450,6 +451,21 @@ static inline int sk_stream_memory_free(struct sock *sk)
 
 extern void sk_stream_rfree(struct sk_buff *skb);
 
+struct socket_alloc {
+	struct socket socket;
+	struct inode vfs_inode;
+};
+
+static inline struct socket *SOCKET_I(struct inode *inode)
+{
+	return &container_of(inode, struct socket_alloc, vfs_inode)->socket;
+}
+
+static inline struct inode *SOCK_INODE(struct socket *socket)
+{
+	return &container_of(socket, struct socket_alloc, socket)->vfs_inode;
+}
+
 static inline void sk_stream_set_owner_r(struct sk_buff *skb, struct sock *sk)
 {
 	skb->sk = sk;
@@ -477,6 +493,7 @@ static inline void sk_add_backlog(struct sock *sk, struct sk_buff *skb)
 		sk->sk_backlog.tail = skb;
 	}
 	skb->next = NULL;
+	kevent_socket_notify(sk, KEVENT_SOCKET_RECV);
 }
 
 #define sk_wait_event(__sk, __timeo, __condition)		\
@@ -679,21 +696,6 @@ static inline struct kiocb *siocb_to_kiocb(struct sock_iocb *si)
 	return si->kiocb;
 }
 
-struct socket_alloc {
-	struct socket socket;
-	struct inode vfs_inode;
-};
-
-static inline struct socket *SOCKET_I(struct inode *inode)
-{
-	return &container_of(inode, struct socket_alloc, vfs_inode)->socket;
-}
-
-static inline struct inode *SOCK_INODE(struct socket *socket)
-{
-	return &container_of(socket, struct socket_alloc, socket)->vfs_inode;
-}
-
 extern void __sk_stream_mem_reclaim(struct sock *sk);
 extern int sk_stream_mem_schedule(struct sock *sk, int size, int kind);
 
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 7a093d0..69f4ad2 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -857,6 +857,7 @@ static inline int tcp_prequeue(struct sock *sk, struct sk_buff *skb)
 			tp->ucopy.memory = 0;
 		} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
 			wake_up_interruptible(sk->sk_sleep);
+			kevent_socket_notify(sk, KEVENT_SOCKET_RECV|KEVENT_SOCKET_SEND);
 			if (!inet_csk_ack_scheduled(sk))
 				inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
 						          (3 * TCP_RTO_MIN) / 4,
diff --git a/kernel/kevent/kevent_socket.c b/kernel/kevent/kevent_socket.c
new file mode 100644
index 0000000..1798092
--- /dev/null
+++ b/kernel/kevent/kevent_socket.c
@@ -0,0 +1,144 @@
+/*
+ * 	kevent_socket.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/timer.h>
+#include <linux/file.h>
+#include <linux/tcp.h>
+#include <linux/kevent.h>
+
+#include <net/sock.h>
+#include <net/request_sock.h>
+#include <net/inet_connection_sock.h>
+
+static int kevent_socket_callback(struct kevent *k)
+{
+	struct inode *inode = k->st->origin;
+	unsigned int events = SOCKET_I(inode)->ops->poll(SOCKET_I(inode)->file, SOCKET_I(inode), NULL);
+
+	if ((events & (POLLIN | POLLRDNORM)) && (k->event.event & (KEVENT_SOCKET_RECV | KEVENT_SOCKET_ACCEPT)))
+		return 1;
+	if ((events & (POLLOUT | POLLWRNORM)) && (k->event.event & KEVENT_SOCKET_SEND))
+		return 1;
+	if (events & (POLLERR | POLLHUP))
+		return -1;
+	return 0;
+}
+
+int kevent_socket_enqueue(struct kevent *k)
+{
+	struct inode *inode;
+	struct socket *sock;
+	int err = -EBADF;
+
+	sock = sockfd_lookup(k->event.id.raw[0], &err);
+	if (!sock)
+		goto err_out_exit;
+
+	inode = igrab(SOCK_INODE(sock));
+	if (!inode)
+		goto err_out_fput;
+
+	err = kevent_storage_enqueue(&inode->st, k);
+	if (err)
+		goto err_out_iput;
+
+	if (k->event.req_flags & KEVENT_REQ_ALWAYS_QUEUE) {
+		kevent_requeue(k);
+		err = 0;
+	} else {
+		err = k->callbacks.callback(k);
+		if (err)
+			goto err_out_dequeue;
+	}
+
+	return err;
+
+err_out_dequeue:
+	kevent_storage_dequeue(k->st, k);
+err_out_iput:
+	iput(inode);
+err_out_fput:
+	sockfd_put(sock);
+err_out_exit:
+	return err;
+}
+
+int kevent_socket_dequeue(struct kevent *k)
+{
+	struct inode *inode = k->st->origin;
+	struct socket *sock;
+
+	kevent_storage_dequeue(k->st, k);
+
+	sock = SOCKET_I(inode);
+	iput(inode);
+	sockfd_put(sock);
+
+	return 0;
+}
+
+void kevent_socket_notify(struct sock *sk, u32 event)
+{
+	if (sk->sk_socket)
+		kevent_storage_ready(&SOCK_INODE(sk->sk_socket)->st, NULL, event);
+}
+
+/*
+ * It is required for network protocols compiled as modules, like IPv6.
+ */
+EXPORT_SYMBOL_GPL(kevent_socket_notify);
+
+#ifdef CONFIG_LOCKDEP
+static struct lock_class_key kevent_sock_key;
+
+void kevent_socket_reinit(struct socket *sock)
+{
+	struct inode *inode = SOCK_INODE(sock);
+
+	lockdep_set_class(&inode->st.lock, &kevent_sock_key);
+}
+
+void kevent_sk_reinit(struct sock *sk)
+{
+	if (sk->sk_socket) {
+		struct inode *inode = SOCK_INODE(sk->sk_socket);
+
+		lockdep_set_class(&inode->st.lock, &kevent_sock_key);
+	}
+}
+#endif
+static int __init kevent_init_socket(void)
+{
+	struct kevent_callbacks sc = {
+		.callback = &kevent_socket_callback,
+		.enqueue = &kevent_socket_enqueue,
+		.dequeue = &kevent_socket_dequeue,
+		.flags = 0,
+	};
+
+	return kevent_add_callbacks(&sc, KEVENT_SOCKET);
+}
+module_init(kevent_init_socket);
diff --git a/net/core/sock.c b/net/core/sock.c
index b77e155..7d5fa3e 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1402,6 +1402,7 @@ static void sock_def_wakeup(struct sock *sk)
 	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
 		wake_up_interruptible_all(sk->sk_sleep);
 	read_unlock(&sk->sk_callback_lock);
+	kevent_socket_notify(sk, KEVENT_SOCKET_RECV|KEVENT_SOCKET_SEND);
 }
 
 static void sock_def_error_report(struct sock *sk)
@@ -1411,6 +1412,7 @@ static void sock_def_error_report(struct sock *sk)
 		wake_up_interruptible(sk->sk_sleep);
 	sk_wake_async(sk,0,POLL_ERR); 
 	read_unlock(&sk->sk_callback_lock);
+	kevent_socket_notify(sk, KEVENT_SOCKET_RECV|KEVENT_SOCKET_SEND);
 }
 
 static void sock_def_readable(struct sock *sk, int len)
@@ -1420,6 +1422,7 @@ static void sock_def_readable(struct sock *sk, int len)
 		wake_up_interruptible(sk->sk_sleep);
 	sk_wake_async(sk,1,POLL_IN);
 	read_unlock(&sk->sk_callback_lock);
+	kevent_socket_notify(sk, KEVENT_SOCKET_RECV|KEVENT_SOCKET_SEND);
 }
 
 static void sock_def_write_space(struct sock *sk)
@@ -1439,6 +1442,7 @@ static void sock_def_write_space(struct sock *sk)
 	}
 
 	read_unlock(&sk->sk_callback_lock);
+	kevent_socket_notify(sk, KEVENT_SOCKET_SEND|KEVENT_SOCKET_RECV);
 }
 
 static void sock_def_destruct(struct sock *sk)
@@ -1489,6 +1493,8 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 	sk->sk_state		=	TCP_CLOSE;
 	sk->sk_socket		=	sock;
 
+	kevent_sk_reinit(sk);
+
 	sock_set_flag(sk, SOCK_ZAPPED);
 
 	if(sock)
@@ -1555,8 +1561,10 @@ void fastcall release_sock(struct sock *sk)
 	if (sk->sk_backlog.tail)
 		__release_sock(sk);
 	sk->sk_lock.owner = NULL;
-	if (waitqueue_active(&sk->sk_lock.wq))
+	if (waitqueue_active(&sk->sk_lock.wq)) {
 		wake_up(&sk->sk_lock.wq);
+		kevent_socket_notify(sk, KEVENT_SOCKET_RECV|KEVENT_SOCKET_SEND);
+	}
 	spin_unlock_bh(&sk->sk_lock.slock);
 }
 EXPORT_SYMBOL(release_sock);
diff --git a/net/core/stream.c b/net/core/stream.c
index d1d7dec..2878c2a 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -36,6 +36,7 @@ void sk_stream_write_space(struct sock *sk)
 			wake_up_interruptible(sk->sk_sleep);
 		if (sock->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN))
 			sock_wake_async(sock, 2, POLL_OUT);
+		kevent_socket_notify(sk, KEVENT_SOCKET_SEND|KEVENT_SOCKET_RECV);
 	}
 }
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 3f884ce..e7dd989 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3119,6 +3119,7 @@ static void tcp_ofo_queue(struct sock *sk)
 
 		__skb_unlink(skb, &tp->out_of_order_queue);
 		__skb_queue_tail(&sk->sk_receive_queue, skb);
+		kevent_socket_notify(sk, KEVENT_SOCKET_RECV);
 		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
 		if(skb->h.th->fin)
 			tcp_fin(skb, sk, skb->h.th);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index c83938b..b0dd70d 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -61,6 +61,7 @@
 #include <linux/jhash.h>
 #include <linux/init.h>
 #include <linux/times.h>
+#include <linux/kevent.h>
 
 #include <net/icmp.h>
 #include <net/inet_hashtables.h>
@@ -870,6 +871,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	   	reqsk_free(req);
 	} else {
 		inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
+		kevent_socket_notify(sk, KEVENT_SOCKET_ACCEPT);
 	}
 	return 0;
 
diff --git a/net/socket.c b/net/socket.c
index 1bc4167..5582b4a 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -85,6 +85,7 @@
 #include <linux/kmod.h>
 #include <linux/audit.h>
 #include <linux/wireless.h>
+#include <linux/kevent.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -490,6 +491,8 @@ static struct socket *sock_alloc(void)
 	inode->i_uid = current->fsuid;
 	inode->i_gid = current->fsgid;
 
+	kevent_socket_reinit(sock);
+
 	get_cpu_var(sockets_in_use)++;
 	put_cpu_var(sockets_in_use);
 	return sock;


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [take28-resend_2->0 1/8] kevent: Description.
  2006-12-17 13:53 ` [take28-resend_2->0 0/8] kevent: Generic event handling mechanism Evgeniy Polyakov
@ 2006-12-17 13:53   ` Evgeniy Polyakov
  2006-12-17 13:53     ` [take28-resend_2->0 2/8] kevent: Core files Evgeniy Polyakov
  2006-12-19  3:47   ` [take28-resend_2->0 0/8] kevent: Generic event handling mechanism Ulrich Drepper
  1 sibling, 1 reply; 76+ messages in thread
From: Evgeniy Polyakov @ 2006-12-17 13:53 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: David Miller, Ulrich Drepper, Andrew Morton, Evgeniy Polyakov,
	netdev, Zach Brown, Christoph Hellwig, Chase Venters,
	Johann Borck, linux-kernel, Jeff Garzik


Description.


diff --git a/Documentation/kevent.txt b/Documentation/kevent.txt
new file mode 100644
index 0000000..2e03a3f
--- /dev/null
+++ b/Documentation/kevent.txt
@@ -0,0 +1,240 @@
+Description.
+
+int kevent_init(struct kevent_ring *ring, unsigned int ring_size, 
+	unsigned int flags);
+
+num - size of the ring buffer in events 
+ring - pointer to allocated ring buffer
+flags - various flags, see KEVENT_FLAGS_* definitions.
+
+Return value: kevent control file descriptor or negative error value.
+
+ struct kevent_ring
+ {
+   unsigned int ring_kidx, ring_over;
+   struct ukevent event[0];
+ }
+
+ring_kidx - index in the ring buffer where kernel will put new events 
+		when kevent_wait() or kevent_get_events() is called 
+ring_over - number of overflows of ring_uidx happend from the start.
+	Overflow counter is used to prevent situation when two threads 
+	are going to free the same events, but one of them was scheduled 
+	away for too long, so ring indexes were wrapped, so when that 
+	thread will be awakened, it will free not those events, which 
+	it suppose to free.
+
+Example userspace code (ring_buffer.c) can be found on project's homepage.
+
+Each kevent syscall can be so called cancellation point in glibc, i.e. when 
+thread has been cancelled in kevent syscall, thread can be safely removed 
+and no events will be lost, since each syscall (kevent_wait() or 
+kevent_get_events()) will copy event into special ring buffer, accessible 
+from other threads or even processes (if shared memory is used).
+
+When kevent is removed (not dequeued when it is ready, but just removed), 
+even if it was ready, it is not copied into ring buffer, since if it is 
+removed, no one cares about it (otherwise user would wait until it becomes 
+ready and got it through usual way using kevent_get_events() or kevent_wait()) 
+and thus no need to copy it to the ring buffer.
+
+-------------------------------------------------------------------------------
+
+
+int kevent_ctl(int fd, unsigned int cmd, unsigned int num, struct ukevent *arg);
+
+fd - is the file descriptor referring to the kevent queue to manipulate. 
+It is created by opening "/dev/kevent" char device, which is created with 
+dynamic minor number and major number assigned for misc devices. 
+
+cmd - is the requested operation. It can be one of the following:
+    KEVENT_CTL_ADD - add event notification 
+    KEVENT_CTL_REMOVE - remove event notification 
+    KEVENT_CTL_MODIFY - modify existing notification 
+    KEVENT_CTL_READY - mark existing events as ready, if number of events is zero,
+    	it just wakes up parked in syscall thread
+
+num - number of struct ukevent in the array pointed to by arg 
+arg - array of struct ukevent
+
+Return value: 
+ number of events processed or negative error value.
+
+When called, kevent_ctl will carry out the operation specified in the 
+cmd parameter.
+-------------------------------------------------------------------------------
+
+ int kevent_get_events(int ctl_fd, unsigned int min_nr, unsigned int max_nr, 
+ 		struct timespec timeout, struct ukevent *buf, unsigned flags);
+
+ctl_fd - file descriptor referring to the kevent queue 
+min_nr - minimum number of completed events that kevent_get_events will block 
+	 waiting for 
+max_nr - number of struct ukevent in buf 
+timeout - time to wait before returning less than min_nr 
+	  events. If this is -1, then wait forever. 
+buf - pointer to an array of struct ukevent. 
+flags - various flags, see KEVENT_FLAGS_* definitions.
+
+Return value:
+ number of events copied or negative error value.
+
+kevent_get_events will wait timeout milliseconds for at least min_nr completed 
+events, copying completed struct ukevents to buf and deleting any 
+KEVENT_REQ_ONESHOT event requests. In nonblocking mode it returns as many 
+events as possible, but not more than max_nr. In blocking mode it waits until 
+timeout or if at least min_nr events are ready.
+
+This function copies event into ring buffer if it was initialized, if ring buffer
+is full, KEVENT_RET_COPY_FAILED flag is set in ret_flags field.
+-------------------------------------------------------------------------------
+
+ int kevent_wait(int ctl_fd, unsigned int num, unsigned int old_uidx, 
+ 	struct timespec timeout, unsigned int flags);
+
+ctl_fd - file descriptor referring to the kevent queue 
+num - number of processed kevents 
+old_uidx - the last index user is aware of
+timeout - time to wait until there is free space in kevent queue
+flags - various flags, see KEVENT_FLAGS_* definitions.
+
+Return value:
+ number of events copied into ring buffer or negative error value.
+
+This syscall waits until either timeout expires or at least one event becomes 
+ready. It also copies events into special ring buffer. If ring buffer is full,
+it waits until there are ready events and then return.
+If kevent is one-shot kevent it is removed in this syscall.
+If kevent is edge-triggered (KEVENT_REQ_ET flag is set in 'req_flags') it is 
+requeued in this syscall for performance reasons.
+-------------------------------------------------------------------------------
+
+ int kevent_commit(int ctl_fd, unsigned int new_idx, unsigned int over);
+
+ctl_fd - file descriptor referring to the kevent queue 
+new_uidx - the last committed kevent
+over - overflow count for given $new_idx value
+
+Return value:
+ number of committed kevents or negative error value.
+
+This function commits, i.e. marks as empty, slots in the ring buffer, so
+they can be reused when userspace completes that entries processing.
+
+Overflow counter is used to prevent situation when two threads are going 
+to free the same events, but one of them was scheduled away for too long, 
+so ring indexes were wrapped, so when that thread will be awakened, it 
+will free not those events, which it suppose to free.
+
+It is possible that returned number of committed events will be smaller than
+requested number - it is possible when several threads try to commit the
+same events.
+-------------------------------------------------------------------------------
+
+The bulk of the interface is entirely done through the ukevent struct. 
+It is used to add event requests, modify existing event requests, 
+specify which event requests to remove, and return completed events.
+
+struct ukevent contains the following members:
+
+struct kevent_id id
+    Id of this request, e.g. socket number, file descriptor and so on 
+__u32 type
+    Event type, e.g. KEVENT_SOCK, KEVENT_INODE, KEVENT_TIMER and so on 
+__u32 event
+    Event itself, e.g. SOCK_ACCEPT, INODE_CREATED, TIMER_FIRED 
+__u32 req_flags
+    Per-event request flags,
+
+    KEVENT_REQ_ONESHOT
+        event will be removed when it is ready 
+
+    KEVENT_REQ_WAKEUP_ALL
+        Kevent wakes up only first thread interested in given event, 
+	or all threads if this flag is set.
+
+    KEVENT_REQ_ET
+        Edge Triggered behaviour. It is an optimisation which allows to move 
+	ready and dequeued (i.e. copied to userspace) event to move into set 
+	of interest for given storage (socket, inode and so on) again. It is 
+	very usefull for cases when the same event should be used many times 
+	(like reading from pipe). It is similar to epoll()'s EPOLLET flag. 
+
+    KEVENT_REQ_LAST_CHECK
+        if set allows to perform the last check on kevent (call appropriate 
+	callback) when kevent is marked as ready and has been removed from 
+	ready queue. If it will be confirmed that kevent is ready 
+	(k->callbacks.callback(k) returns true) then kevent will be copied 
+	to userspace, otherwise it will be requeued back to storage. 
+	Second (checking) call is performed with this bit cleared, so callback 
+	can detect when it was called from kevent_storage_ready() - bit is set, 
+	or kevent_dequeue_ready() - bit is cleared. If kevent will be requeued, 
+	bit will be set again.
+
+   KEVENT_REQ_ALWAYS_QUEUE
+        If this flag is set kevent will be queued into ready queue if it is 
+	ready at enqueue time, otherwise it will be copied back to userspace
+	and will not be queued into the storage.
+
+__u32 ret_flags
+    Per-event return flags
+
+    KEVENT_RET_BROKEN
+        Kevent is broken 
+
+    KEVENT_RET_DONE
+        Kevent processing was finished successfully 
+
+    KEVENT_RET_COPY_FAILED
+        Kevent was not copied into ring buffer due to some error conditions. 
+
+__u32 ret_data
+    Event return data. Event originator fills it with anything it likes 
+    (for example timer notifications put number of milliseconds when timer 
+    has fired 
+union { __u32 user[2]; void *ptr; }
+    User's data. It is not used, just copied to/from user. The whole structure 
+    is aligned to 8 bytes already, so the last union is aligned properly. 
+
+-------------------------------------------------------------------------------
+
+Kevent waiting syscall flags.
+
+KEVENT_FLAGS_ABSTIME - provided timespec parameter contains absolute time, 
+	for example Aug 27, 2194, or time(NULL) + 10.
+
+-------------------------------------------------------------------------------
+
+Usage
+
+For KEVENT_CTL_ADD, all fields relevant to the event type must be filled 
+(id, type, event, req_flags). 
+After kevent_ctl(..., KEVENT_CTL_ADD, ...) returns each struct's ret_flags 
+should be checked to see if the event is already broken or done.
+
+For KEVENT_CTL_MODIFY, the id, req_flags, and user and event fields must be 
+set and an existing kevent request must have matching id and user fields. If 
+match is found, req_flags and event are replaced with the newly supplied 
+values and requeueing is started, so modified kevent can be checked and 
+probably marked as ready immediately. If a match can't be found, the 
+passed in ukevent's ret_flags has KEVENT_RET_BROKEN set. KEVENT_RET_DONE is 
+always set.
+
+For KEVENT_CTL_REMOVE, the id and user fields must be set and an existing 
+kevent request must have matching id and user fields. If a match is found, 
+the kevent request is removed. If a match can't be found, the passed in 
+ukevent's ret_flags has KEVENT_RET_BROKEN set. KEVENT_RET_DONE is always set.
+
+For kevent_get_events, the entire structure is returned.
+
+-------------------------------------------------------------------------------
+
+Usage cases
+
+kevent_timer
+struct ukevent should contain following fields:
+    type - KEVENT_TIMER 
+    event - KEVENT_TIMER_FIRED 
+    req_flags - KEVENT_REQ_ONESHOT if you want to fire that timer only once 
+    id.raw[0] - number of seconds after commit when this timer shout expire 
+    id.raw[0] - additional to number of seconds number of nanoseconds 


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [take28-resend_2->0 2/8] kevent: Core files.
  2006-12-17 13:53   ` [take28-resend_2->0 1/8] kevent: Description Evgeniy Polyakov
@ 2006-12-17 13:53     ` Evgeniy Polyakov
  2006-12-17 13:53       ` [take28-resend_2->0 3/8] kevent: poll/select() notifications Evgeniy Polyakov
  0 siblings, 1 reply; 76+ messages in thread
From: Evgeniy Polyakov @ 2006-12-17 13:53 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: David Miller, Ulrich Drepper, Andrew Morton, Evgeniy Polyakov,
	netdev, Zach Brown, Christoph Hellwig, Chase Venters,
	Johann Borck, linux-kernel, Jeff Garzik


Core files.

This patch includes core kevent files:
 * userspace controlling
 * kernelspace interfaces
 * initialization
 * notification state machines

Some bits of documentation can be found on project's homepage (and links from there):
http://tservice.net.ru/~s0mbre/old/?section=projects&item=kevent

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>

diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S
index 7e639f7..a6221c2 100644
--- a/arch/i386/kernel/syscall_table.S
+++ b/arch/i386/kernel/syscall_table.S
@@ -318,3 +318,8 @@ ENTRY(sys_call_table)
 	.long sys_vmsplice
 	.long sys_move_pages
 	.long sys_getcpu
+	.long sys_kevent_get_events
+	.long sys_kevent_ctl		/* 320 */
+	.long sys_kevent_wait
+	.long sys_kevent_commit
+	.long sys_kevent_init
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index b4aa875..dda2168 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -714,8 +714,13 @@ ia32_sys_call_table:
 	.quad compat_sys_get_robust_list
 	.quad sys_splice
 	.quad sys_sync_file_range
-	.quad sys_tee
+	.quad sys_tee			/* 315 */
 	.quad compat_sys_vmsplice
 	.quad compat_sys_move_pages
 	.quad sys_getcpu
+	.quad sys_kevent_get_events
+	.quad sys_kevent_ctl		/* 320 */
+	.quad sys_kevent_wait
+	.quad sys_kevent_commit
+	.quad sys_kevent_init
 ia32_syscall_end:		
diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
index bd99870..57a6b8c 100644
--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h
@@ -324,10 +324,15 @@
 #define __NR_vmsplice		316
 #define __NR_move_pages		317
 #define __NR_getcpu		318
+#define __NR_kevent_get_events	319
+#define __NR_kevent_ctl		320
+#define __NR_kevent_wait	321
+#define __NR_kevent_commit	322
+#define __NR_kevent_init	323
 
 #ifdef __KERNEL__
 
-#define NR_syscalls 319
+#define NR_syscalls 324
 #include <linux/err.h>
 
 /*
diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h
index 6137146..17d750d 100644
--- a/include/asm-x86_64/unistd.h
+++ b/include/asm-x86_64/unistd.h
@@ -619,10 +619,20 @@ __SYSCALL(__NR_sync_file_range, sys_sync_file_range)
 __SYSCALL(__NR_vmsplice, sys_vmsplice)
 #define __NR_move_pages		279
 __SYSCALL(__NR_move_pages, sys_move_pages)
+#define __NR_kevent_get_events	280
+__SYSCALL(__NR_kevent_get_events, sys_kevent_get_events)
+#define __NR_kevent_ctl		281
+__SYSCALL(__NR_kevent_ctl, sys_kevent_ctl)
+#define __NR_kevent_wait	282
+__SYSCALL(__NR_kevent_wait, sys_kevent_wait)
+#define __NR_kevent_commit	283
+__SYSCALL(__NR_kevent_commit, sys_kevent_commit)
+#define __NR_kevent_init	284
+__SYSCALL(__NR_kevent_init, sys_kevent_init)
 
 #ifdef __KERNEL__
 
-#define __NR_syscall_max __NR_move_pages
+#define __NR_syscall_max __NR_kevent_init
 #include <linux/err.h>
 
 #ifndef __NO_STUBS
diff --git a/include/linux/kevent.h b/include/linux/kevent.h
new file mode 100644
index 0000000..2c31df3
--- /dev/null
+++ b/include/linux/kevent.h
@@ -0,0 +1,244 @@
+/*
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef __KEVENT_H
+#define __KEVENT_H
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/rbtree.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/wait.h>
+#include <linux/net.h>
+#include <linux/rcupdate.h>
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/hrtimer.h>
+#include <linux/kevent_storage.h>
+#include <linux/ukevent.h>
+
+#define KEVENT_MIN_BUFFS_ALLOC	3
+
+struct kevent;
+struct kevent_storage;
+typedef int (* kevent_callback_t)(struct kevent *);
+
+/* @callback is called each time new event has been caught. */
+/* @enqueue is called each time new event is queued. */
+/* @dequeue is called each time event is dequeued. */
+/* @flags - flags for given set of callbacks. */
+
+struct kevent_callbacks {
+	kevent_callback_t	callback, enqueue, dequeue;
+	unsigned int		flags;
+};
+
+#define KEVENT_CALLBACKS_KERNELONLY	0x1
+
+int kevent_event_is_allowed(struct ukevent *e);
+
+#define KEVENT_READY		0x1
+#define KEVENT_STORAGE		0x2
+#define KEVENT_USER		0x4
+
+struct kevent
+{
+	/* Used for kevent freeing.*/
+	struct rcu_head		rcu_head;
+	struct ukevent		event;
+	/* This lock protects ukevent manipulations, e.g. ret_flags changes. */
+	spinlock_t		ulock;
+
+	/* Entry of user's tree. */
+	struct rb_node		kevent_node;
+	/* Entry of origin's queue. */
+	struct list_head	storage_entry;
+	/* Entry of user's ready. */
+	struct list_head	ready_entry;
+
+	u32			flags;
+
+	/* User who requested this kevent. */
+	struct kevent_user	*user;
+	/* Kevent container. */
+	struct kevent_storage	*st;
+
+	struct kevent_callbacks	callbacks;
+
+	/* Private data for different storages.
+	 * poll()/select storage has a list of wait_queue_t containers
+	 * for each ->poll() { poll_wait()' } here.
+	 */
+	void			*priv;
+};
+
+struct kevent_user
+{
+	struct rb_root		kevent_root;
+	spinlock_t		kevent_lock;
+	/* Number of queued kevents. */
+	unsigned int		kevent_num;
+
+	/* List of ready kevents. */
+	struct list_head	ready_list;
+	/* Number of ready kevents. */
+	unsigned int		ready_num;
+	/* Protects all manipulations with ready queue. */
+	spinlock_t 		ready_lock;
+
+	/* Protects against simultaneous kevent_user control manipulations. */
+	struct mutex		ctl_mutex;
+	/* Wait until some events are ready. */
+	wait_queue_head_t	wait;
+	/* Exit from syscall if someone wants us to do it */
+	int			need_exit;
+
+	/* Reference counter, increased for each new kevent. */
+	atomic_t		refcnt;
+
+	/* Mutex protecting userspace ring buffer. */
+	struct mutex		ring_lock;
+	/* Kernel index and size of the userspace ring buffer. */
+	unsigned int		kidx, uidx, ring_size, ring_over, full;
+	/* Pointer to userspace ring buffer. */
+	struct kevent_ring __user *pring;
+
+	/* Is used for absolute waiting times. */
+	struct hrtimer		timer;
+
+#ifdef CONFIG_KEVENT_USER_STAT
+	unsigned long		im_num;
+	unsigned long		wait_num, ring_num;
+	unsigned long		total;
+#endif
+};
+
+int kevent_enqueue(struct kevent *k);
+int kevent_dequeue(struct kevent *k);
+int kevent_init(struct kevent *k);
+void kevent_requeue(struct kevent *k);
+int kevent_break(struct kevent *k);
+
+int kevent_add_callbacks(const struct kevent_callbacks *cb, int pos);
+
+void kevent_storage_ready(struct kevent_storage *st,
+		kevent_callback_t ready_callback, u32 event);
+int kevent_storage_init(void *origin, struct kevent_storage *st);
+void kevent_storage_fini(struct kevent_storage *st);
+int kevent_storage_enqueue(struct kevent_storage *st, struct kevent *k);
+void kevent_storage_dequeue(struct kevent_storage *st, struct kevent *k);
+
+void kevent_ready(struct kevent *k, int ret);
+
+int kevent_user_add_ukevent(struct ukevent *uk, struct kevent_user *u);
+
+#ifdef CONFIG_KEVENT_POLL
+void kevent_poll_reinit(struct file *file);
+#else
+static inline void kevent_poll_reinit(struct file *file)
+{
+}
+#endif
+
+#ifdef CONFIG_KEVENT_USER_STAT
+static inline void kevent_stat_init(struct kevent_user *u)
+{
+	u->wait_num = u->im_num = u->total = u->ring_num = 0;
+}
+static inline void kevent_stat_print(struct kevent_user *u)
+{
+	printk(KERN_INFO "%s: u: %p, wait: %lu, ring: %lu, immediately: %lu, total: %lu.\n",
+			__func__, u, u->wait_num, u->ring_num, u->im_num, u->total);
+}
+static inline void kevent_stat_im(struct kevent_user *u)
+{
+	u->im_num++;
+}
+static inline void kevent_stat_ring(struct kevent_user *u)
+{
+	u->ring_num++;
+}
+static inline void kevent_stat_wait(struct kevent_user *u)
+{
+	u->wait_num++;
+}
+static inline void kevent_stat_total(struct kevent_user *u)
+{
+	u->total++;
+}
+#else
+#define kevent_stat_print(u)		({ (void) u;})
+#define kevent_stat_init(u)		({ (void) u;})
+#define kevent_stat_im(u)		({ (void) u;})
+#define kevent_stat_wait(u)		({ (void) u;})
+#define kevent_stat_ring(u)		({ (void) u;})
+#define kevent_stat_total(u)		({ (void) u;})
+#endif
+
+#ifdef CONFIG_LOCKDEP
+void kevent_socket_reinit(struct socket *sock);
+void kevent_sk_reinit(struct sock *sk);
+#else
+static inline void kevent_socket_reinit(struct socket *sock)
+{
+}
+static inline void kevent_sk_reinit(struct sock *sk)
+{
+}
+#endif
+#ifdef CONFIG_KEVENT_SOCKET
+void kevent_socket_notify(struct sock *sock, u32 event);
+int kevent_socket_dequeue(struct kevent *k);
+int kevent_socket_enqueue(struct kevent *k);
+#define sock_async(__sk) sock_flag(__sk, SOCK_ASYNC)
+#else
+static inline void kevent_socket_notify(struct sock *sock, u32 event)
+{
+}
+#define sock_async(__sk)	({ (void)__sk; 0; })
+#endif
+
+#ifdef CONFIG_KEVENT_POLL
+static inline void kevent_init_file(struct file *file)
+{
+	kevent_storage_init(file, &file->st);
+}
+
+static inline void kevent_cleanup_file(struct file *file)
+{
+	kevent_storage_fini(&file->st);
+}
+#else
+static inline void kevent_init_file(struct file *file) {}
+static inline void kevent_cleanup_file(struct file *file) {}
+#endif
+
+#ifdef CONFIG_KEVENT_PIPE
+extern void kevent_pipe_notify(struct inode *inode, u32 events);
+#else
+static inline void kevent_pipe_notify(struct inode *inode, u32 events) {}
+#endif
+
+#ifdef CONFIG_KEVENT_SIGNAL
+extern int kevent_signal_notify(struct task_struct *tsk, int sig);
+#else
+static inline int kevent_signal_notify(struct task_struct *tsk, int sig) {return 0;}
+#endif
+
+#endif /* __KEVENT_H */
diff --git a/include/linux/kevent_storage.h b/include/linux/kevent_storage.h
new file mode 100644
index 0000000..a38575d
--- /dev/null
+++ b/include/linux/kevent_storage.h
@@ -0,0 +1,11 @@
+#ifndef __KEVENT_STORAGE_H
+#define __KEVENT_STORAGE_H
+
+struct kevent_storage
+{
+	void			*origin;		/* Originator's pointer, e.g. struct sock or struct file. Can be NULL. */
+	struct list_head	list;			/* List of queued kevents. */
+	spinlock_t		lock;			/* Protects users queue. */
+};
+
+#endif /* __KEVENT_STORAGE_H */
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 2d1c3d5..7574ec3 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -54,6 +54,8 @@ struct compat_stat;
 struct compat_timeval;
 struct robust_list_head;
 struct getcpu_cache;
+struct ukevent;
+struct kevent_ring;
 
 #include <linux/types.h>
 #include <linux/aio_abi.h>
@@ -599,4 +601,11 @@ asmlinkage long sys_set_robust_list(struct robust_list_head __user *head,
 				    size_t len);
 asmlinkage long sys_getcpu(unsigned __user *cpu, unsigned __user *node, struct getcpu_cache __user *cache);
 
+asmlinkage long sys_kevent_get_events(int ctl_fd, unsigned int min, unsigned int max,
+		struct timespec timeout, struct ukevent __user *buf, unsigned flags);
+asmlinkage long sys_kevent_ctl(int ctl_fd, unsigned int cmd, unsigned int num, struct ukevent __user *buf);
+asmlinkage long sys_kevent_wait(int ctl_fd, unsigned int num, unsigned int old_uidx, 
+		struct timespec timeout, unsigned int flags);
+asmlinkage long sys_kevent_commit(int ctl_fd, unsigned int new_uidx, unsigned int over);
+asmlinkage long sys_kevent_init(int ctl_fd, struct kevent_ring __user *ring, unsigned int num, unsigned int flags);
 #endif
diff --git a/include/linux/ukevent.h b/include/linux/ukevent.h
new file mode 100644
index 0000000..01932d0
--- /dev/null
+++ b/include/linux/ukevent.h
@@ -0,0 +1,185 @@
+/*
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef __UKEVENT_H
+#define __UKEVENT_H
+
+#include <linux/types.h>
+
+/*
+ * Kevent request flags.
+ */
+
+/* Process this event only once and then remove it. */
+#define KEVENT_REQ_ONESHOT	0x1
+/* Kevent wakes up only first thread interested in given event,
+ * or all threads if this flag is set.
+ */
+#define KEVENT_REQ_WAKEUP_ALL	0x2
+/* Edge Triggered behaviour. */
+#define KEVENT_REQ_ET		0x4
+/* Perform the last check on kevent (call appropriate callback) when
+ * kevent is marked as ready and has been removed from ready queue.
+ * If it will be confirmed that kevent is ready 
+ * (k->callbacks.callback(k) returns true) then kevent will be copied
+ * to userspace, otherwise it will be requeued back to storage. 
+ * Second (checking) call is performed with this bit _cleared_ so
+ * callback can detect when it was called from 
+ * kevent_storage_ready() - bit is set, or 
+ * kevent_dequeue_ready() - bit is cleared. 
+ * If kevent will be requeued, bit will be set again. */
+#define KEVENT_REQ_LAST_CHECK	0x8
+/*
+ * Always queue kevent even if it is immediately ready.
+ */
+#define KEVENT_REQ_ALWAYS_QUEUE	0x16
+
+/*
+ * Kevent return flags.
+ */
+/* Kevent is broken. */
+#define KEVENT_RET_BROKEN	0x1
+/* Kevent processing was finished successfully. */
+#define KEVENT_RET_DONE		0x2
+/* Kevent was not copied into ring buffer due to some error conditions. */
+#define KEVENT_RET_COPY_FAILED	0x4
+
+/*
+ * Kevent type set.
+ */
+#define KEVENT_SOCKET 		0
+#define KEVENT_INODE		1
+#define KEVENT_TIMER		2
+#define KEVENT_POLL		3
+#define KEVENT_NAIO		4
+#define KEVENT_AIO		5
+#define KEVENT_PIPE		6
+#define KEVENT_SIGNAL		7
+#define KEVENT_POSIX_TIMER	8
+
+/* Used as array size, so must be equal to the last KEVENT type + 1 */
+#define	KEVENT_MAX		9
+
+/*
+ * Per-type event sets.
+ * Number of per-event sets should be exactly as number of kevent types.
+ */
+
+/*
+ * Timer events.
+ */
+#define	KEVENT_TIMER_FIRED	0x1
+
+/*
+ * Socket/network asynchronous IO and PIPE events.
+ */
+#define	KEVENT_SOCKET_RECV	0x1
+#define	KEVENT_SOCKET_ACCEPT	0x2
+#define	KEVENT_SOCKET_SEND	0x4
+
+/*
+ * Inode events.
+ */
+#define	KEVENT_INODE_CREATE	0x1
+#define	KEVENT_INODE_REMOVE	0x2
+
+/*
+ * Poll events.
+ */
+#define	KEVENT_POLL_POLLIN	0x0001
+#define	KEVENT_POLL_POLLPRI	0x0002
+#define	KEVENT_POLL_POLLOUT	0x0004
+#define	KEVENT_POLL_POLLERR	0x0008
+#define	KEVENT_POLL_POLLHUP	0x0010
+#define	KEVENT_POLL_POLLNVAL	0x0020
+
+#define	KEVENT_POLL_POLLRDNORM	0x0040
+#define	KEVENT_POLL_POLLRDBAND	0x0080
+#define	KEVENT_POLL_POLLWRNORM	0x0100
+#define	KEVENT_POLL_POLLWRBAND	0x0200
+#define	KEVENT_POLL_POLLMSG	0x0400
+#define	KEVENT_POLL_POLLREMOVE	0x1000
+
+/*
+ * Asynchronous IO events.
+ */
+#define	KEVENT_AIO_BIO		0x1
+
+/*
+ * Signal events.
+ */
+#define KEVENT_SIGNAL_DELIVERY		0x1
+
+/* If set in raw64, then given signals will not be delivered
+ * in a usual way through sigmask update and signal callback 
+ * invocation. */
+#define KEVENT_SIGNAL_NOMASK	0x8000000000000000ULL
+
+/* Mask of all possible event values. */
+#define KEVENT_MASK_ALL		0xffffffff
+/* Empty mask of ready events. */
+#define KEVENT_MASK_EMPTY	0x0
+
+struct kevent_id
+{
+	union {
+		__u32		raw[2];
+		__u64		raw_u64 __attribute__((aligned(8)));
+	};
+};
+
+struct ukevent
+{
+	/* Id of this request, e.g. socket number, file descriptor and so on... */
+	struct kevent_id	id;
+	/* Event type, e.g. KEVENT_SOCK, KEVENT_INODE, KEVENT_TIMER and so on... */
+	__u32			type;
+	/* Event itself, e.g. SOCK_ACCEPT, INODE_CREATED, TIMER_FIRED... */
+	__u32			event;
+	/* Per-event request flags */
+	__u32			req_flags;
+	/* Per-event return flags */
+	__u32			ret_flags;
+	/* Event return data. Event originator fills it with anything it likes. */
+	__u32			ret_data[2];
+	/* User's data. It is not used, just copied to/from user.
+	 * The whole structure is aligned to 8 bytes already, so the last union
+	 * is aligned properly.
+	 */
+	union {
+		__u32		user[2];
+		void		*ptr;
+	};
+};
+
+struct kevent_ring
+{
+	unsigned int		ring_kidx, ring_over;
+	struct ukevent		event[0];
+};
+
+#define	KEVENT_CTL_ADD 		0
+#define	KEVENT_CTL_REMOVE	1
+#define	KEVENT_CTL_MODIFY	2
+#define	KEVENT_CTL_READY	3
+
+/* Provided timespec parameter uses absolute time, i.e. 'wait until Aug 27, 2194' */
+#define KEVENT_FLAGS_ABSTIME	1
+
+#endif /* __UKEVENT_H */
diff --git a/init/Kconfig b/init/Kconfig
index d2eb7a8..c7d8250 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -201,6 +201,8 @@ config AUDITSYSCALL
 	  such as SELinux.  To use audit's filesystem watch feature, please
 	  ensure that INOTIFY is configured.
 
+source "kernel/kevent/Kconfig"
+
 config IKCONFIG
 	bool "Kernel .config support"
 	---help---
diff --git a/kernel/Makefile b/kernel/Makefile
index d62ec66..2d7a6dd 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -47,6 +47,7 @@ obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
 obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
+obj-$(CONFIG_KEVENT) += kevent/
 obj-$(CONFIG_RELAY) += relay.o
 obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
 obj-$(CONFIG_TASKSTATS) += taskstats.o
diff --git a/kernel/kevent/Kconfig b/kernel/kevent/Kconfig
new file mode 100644
index 0000000..c6cdb0a
--- /dev/null
+++ b/kernel/kevent/Kconfig
@@ -0,0 +1,66 @@
+config KEVENT
+	bool "Kernel event notification mechanism" if EMBEDDED
+	default y
+	help
+	  This option enables event queue mechanism.
+	  It can be used as replacement for poll()/select(), AIO callback
+	  invocations, advanced timer notifications and other kernel
+	  object status changes.
+
+config KEVENT_USER_STAT
+	bool "Kevent user statistic"
+	depends on KEVENT
+	help
+	  This option will turn kevent_user statistic collection on.
+	  Statistic data includes total number of kevent, number of kevents
+	  which are ready immediately at insertion time and number of kevents
+	  which were removed through readiness completion.
+	  It will be printed each time control kevent descriptor is closed.
+
+config KEVENT_TIMER
+	bool "Kernel event notifications for timers"
+	depends on KEVENT
+	default y
+	help
+	  This option allows to use timers through KEVENT subsystem.
+
+config KEVENT_POLL
+	bool "Kernel event notifications for poll()/select()"
+	depends on KEVENT
+	default y
+	help
+	  This option allows to use kevent subsystem for poll()/select()
+	  notifications.
+
+config KEVENT_SOCKET
+	bool "Kernel event notifications for sockets"
+	depends on NET && KEVENT
+	default y
+	help
+	  This option enables notifications through KEVENT subsystem of 
+	  sockets operations, like new packet receiving conditions, 
+	  ready for accept conditions and so on.
+
+config KEVENT_PIPE
+	bool "Kernel event notifications for pipes"
+	depends on KEVENT
+	default y
+	help
+	  This option enables notifications through KEVENT subsystem of 
+	  pipe read/write operations.
+
+config KEVENT_SIGNAL
+	bool "Kernel event notifications for signals"
+	depends on KEVENT
+	default y
+	help
+	  This option enables signal delivery through KEVENT subsystem.
+	  Signals which were requested to be delivered through kevent
+	  subsystem must be registered through usual signal() and others
+	  syscalls, this option allows alternative delivery.
+	  With KEVENT_SIGNAL_NOMASK flag being set in kevent for set of 
+	  signals, they will not be delivered in a usual way.
+	  Kevents for appropriate signals are not copied when process forks,
+	  new process must add new kevents after fork(). Mask of signals
+	  is copied as before.
+
diff --git a/kernel/kevent/Makefile b/kernel/kevent/Makefile
new file mode 100644
index 0000000..f98e0c8
--- /dev/null
+++ b/kernel/kevent/Makefile
@@ -0,0 +1,6 @@
+obj-y := kevent.o kevent_user.o
+obj-$(CONFIG_KEVENT_TIMER) += kevent_timer.o
+obj-$(CONFIG_KEVENT_POLL) += kevent_poll.o
+obj-$(CONFIG_KEVENT_SOCKET) += kevent_socket.o
+obj-$(CONFIG_KEVENT_PIPE) += kevent_pipe.o
+obj-$(CONFIG_KEVENT_SIGNAL) += kevent_signal.o
diff --git a/kernel/kevent/kevent.c b/kernel/kevent/kevent.c
new file mode 100644
index 0000000..1781c57
--- /dev/null
+++ b/kernel/kevent/kevent.c
@@ -0,0 +1,265 @@
+/*
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/mempool.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/kevent.h>
+
+/*
+ * Attempts to add an event into appropriate origin's queue.
+ * Returns positive value if this event is ready immediately,
+ * negative value in case of error and zero if event has been queued.
+ * ->enqueue() callback must increase origin's reference counter.
+ */
+int kevent_enqueue(struct kevent *k)
+{
+	return k->callbacks.enqueue(k);
+}
+
+/*
+ * Remove event from the appropriate queue.
+ * ->dequeue() callback must decrease origin's reference counter.
+ */
+int kevent_dequeue(struct kevent *k)
+{
+	return k->callbacks.dequeue(k);
+}
+
+/*
+ * Mark kevent as broken.
+ */
+int kevent_break(struct kevent *k)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&k->ulock, flags);
+	k->event.ret_flags |= KEVENT_RET_BROKEN;
+	spin_unlock_irqrestore(&k->ulock, flags);
+	return -EINVAL;
+}
+
+static struct kevent_callbacks kevent_registered_callbacks[KEVENT_MAX] __read_mostly;
+
+int kevent_event_is_allowed(struct ukevent *e)
+{
+	if (unlikely(e->type >= KEVENT_MAX))
+		return 0;
+
+	if (!kevent_registered_callbacks[e->type].callback)
+		return 0;
+
+	if (unlikely(kevent_registered_callbacks[e->type].callback == kevent_break))
+		return 0;
+
+	if (kevent_registered_callbacks[e->type].flags & KEVENT_CALLBACKS_KERNELONLY)
+		return 0;
+	
+	return 1;
+}
+
+int kevent_add_callbacks(const struct kevent_callbacks *cb, int pos)
+{
+	struct kevent_callbacks *p;
+
+	if (pos >= KEVENT_MAX)
+		return -EINVAL;
+
+	p = &kevent_registered_callbacks[pos];
+
+	p->enqueue = (cb->enqueue) ? cb->enqueue : kevent_break;
+	p->dequeue = (cb->dequeue) ? cb->dequeue : kevent_break;
+	p->callback = (cb->callback) ? cb->callback : kevent_break;
+	p->flags = cb->flags;
+
+	printk(KERN_INFO "KEVENT: Added callbacks for type %d.\n", pos);
+	return 0;
+}
+
+/*
+ * Must be called before event is going to be added into some origin's queue.
+ * Initializes ->enqueue(), ->dequeue() and ->callback() callbacks.
+ * If failed, kevent should not be used or kevent_enqueue() will fail to add
+ * this kevent into origin's queue with setting
+ * KEVENT_RET_BROKEN flag in kevent->event.ret_flags.
+ */
+int kevent_init(struct kevent *k)
+{
+	spin_lock_init(&k->ulock);
+	k->flags = 0;
+
+	if (unlikely(k->event.type >= KEVENT_MAX)) {
+		kevent_break(k);
+		return -ENOSYS;
+	}
+
+	if (!kevent_registered_callbacks[k->event.type].callback) {
+		kevent_break(k);
+		return -ENOSYS;
+	}
+
+	k->callbacks = kevent_registered_callbacks[k->event.type];
+	if (unlikely(k->callbacks.callback == kevent_break)) {
+		kevent_break(k);
+		return -ENOSYS;
+	}
+
+	return 0;
+}
+
+/*
+ * Called from ->enqueue() callback when reference counter for given
+ * origin (socket, inode...) has been increased.
+ */
+int kevent_storage_enqueue(struct kevent_storage *st, struct kevent *k)
+{
+	unsigned long flags;
+
+	k->st = st;
+	spin_lock_irqsave(&st->lock, flags);
+	list_add_tail_rcu(&k->storage_entry, &st->list);
+	k->flags |= KEVENT_STORAGE;
+	spin_unlock_irqrestore(&st->lock, flags);
+	return 0;
+}
+
+/*
+ * Dequeue kevent from origin's queue.
+ * It does not decrease origin's reference counter in any way
+ * and must be called before it, so storage itself must be valid.
+ * It is called from ->dequeue() callback.
+ */
+void kevent_storage_dequeue(struct kevent_storage *st, struct kevent *k)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&st->lock, flags);
+	if (k->flags & KEVENT_STORAGE) {
+		list_del_rcu(&k->storage_entry);
+		k->flags &= ~KEVENT_STORAGE;
+	}
+	spin_unlock_irqrestore(&st->lock, flags);
+}
+
+void kevent_ready(struct kevent *k, int ret)
+{
+	unsigned long flags;
+	int rem;
+
+	spin_lock_irqsave(&k->ulock, flags);
+	if (ret > 0)
+		k->event.ret_flags |= KEVENT_RET_DONE;
+	else if (ret < 0)
+		k->event.ret_flags |= (KEVENT_RET_BROKEN | KEVENT_RET_DONE);
+	else
+		ret = (k->event.ret_flags & (KEVENT_RET_BROKEN|KEVENT_RET_DONE));
+	rem = (k->event.req_flags & KEVENT_REQ_ONESHOT);
+	spin_unlock_irqrestore(&k->ulock, flags);
+
+	if (ret) {
+		if ((rem || ret < 0) && (k->flags & KEVENT_STORAGE)) {
+			list_del_rcu(&k->storage_entry);
+			k->flags &= ~KEVENT_STORAGE;
+		}
+
+		spin_lock_irqsave(&k->user->ready_lock, flags);
+		if (!(k->flags & KEVENT_READY)) {
+			list_add_tail(&k->ready_entry, &k->user->ready_list);
+			k->flags |= KEVENT_READY;
+			k->user->ready_num++;
+		}
+		spin_unlock_irqrestore(&k->user->ready_lock, flags);
+		wake_up(&k->user->wait);
+	}
+}
+
+/*
+ * Call kevent ready callback and queue it into ready queue if needed.
+ * If kevent is marked as one-shot, then remove it from storage queue.
+ */
+static int __kevent_requeue(struct kevent *k, u32 event)
+{
+	int ret;
+
+	ret = k->callbacks.callback(k);
+
+	kevent_ready(k, ret);
+
+	return ret;
+}
+
+/*
+ * Check if kevent is ready (by invoking it's callback) and requeue/remove
+ * if needed.
+ */
+void kevent_requeue(struct kevent *k)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&k->st->lock, flags);
+	__kevent_requeue(k, 0);
+	spin_unlock_irqrestore(&k->st->lock, flags);
+}
+
+/*
+ * Called each time some activity in origin (socket, inode...) is noticed.
+ */
+void kevent_storage_ready(struct kevent_storage *st,
+		kevent_callback_t ready_callback, u32 event)
+{
+	struct kevent *k;
+	int wake_num = 0;
+
+	rcu_read_lock();
+	if (unlikely(ready_callback))
+		list_for_each_entry_rcu(k, &st->list, storage_entry)
+			(*ready_callback)(k);
+
+	list_for_each_entry_rcu(k, &st->list, storage_entry) {
+		if (event & k->event.event)
+			if ((k->event.req_flags & KEVENT_REQ_WAKEUP_ALL) || wake_num == 0)
+				if (__kevent_requeue(k, event))
+					wake_num++;
+	}
+	rcu_read_unlock();
+}
+
+int kevent_storage_init(void *origin, struct kevent_storage *st)
+{
+	spin_lock_init(&st->lock);
+	st->origin = origin;
+	INIT_LIST_HEAD(&st->list);
+	return 0;
+}
+
+/*
+ * Mark all events as broken, that will remove them from storage,
+ * so storage origin (inode, socket and so on) can be safely removed.
+ * No new entries are allowed to be added into the storage at this point.
+ * (Socket is removed from file table at this point for example).
+ */
+void kevent_storage_fini(struct kevent_storage *st)
+{
+	kevent_storage_ready(st, kevent_break, KEVENT_MASK_ALL);
+}
diff --git a/kernel/kevent/kevent_user.c b/kernel/kevent/kevent_user.c
new file mode 100644
index 0000000..7bd3d7a
--- /dev/null
+++ b/kernel/kevent/kevent_user.c
@@ -0,0 +1,1360 @@
+/*
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/device.h>
+#include <linux/poll.h>
+#include <linux/kevent.h>
+#include <linux/miscdevice.h>
+#include <asm/io.h>
+
+static kmem_cache_t *kevent_cache __read_mostly;
+static kmem_cache_t *kevent_user_cache __read_mostly;
+
+static int kevent_debug_abstime;
+
+/*
+ * kevents are pollable, return POLLIN and POLLRDNORM
+ * when there is at least one ready kevent.
+ */
+static unsigned int kevent_user_poll(struct file *file, struct poll_table_struct *wait)
+{
+	struct kevent_user *u = file->private_data;
+	unsigned int mask;
+
+	poll_wait(file, &u->wait, wait);
+	mask = 0;
+
+	if (u->ready_num || u->need_exit)
+		mask |= POLLIN | POLLRDNORM;
+	u->need_exit = 0;
+
+	return mask;
+}
+
+static inline unsigned int kevent_ring_space(struct kevent_user *u)
+{
+	if (u->full)
+		return 0;
+
+	return (u->uidx > u->kidx)?
+		(u->uidx - u->kidx):
+		(u->ring_size - (u->kidx - u->uidx));
+}
+
+static inline int kevent_ring_index_inc(unsigned int *pidx, unsigned int size)
+{
+	unsigned int idx = *pidx;
+
+	if (++idx >= size)
+		idx = 0;
+	*pidx = idx;
+	return (idx == 0);
+}
+
+/*
+ * Copies kevent into userspace ring buffer if it was initialized.
+ * Returns 
+ *  0 on success or if ring buffer is not used
+ *  -EAGAIN if there were no place for that kevent
+ *  -EFAULT if copy_to_user() failed.
+ *
+ *  Must be called under kevent_user->ring_lock locked.
+ */
+static int kevent_copy_ring_buffer(struct kevent *k)
+{
+	struct kevent_ring __user *ring;
+	struct kevent_user *u = k->user;
+	unsigned long flags;
+	int err;
+
+	ring = u->pring;
+	if (!ring)
+		return 0;
+
+	if (!kevent_ring_space(u))
+		return -EAGAIN;
+
+	if (copy_to_user(&ring->event[u->kidx], &k->event, sizeof(struct ukevent))) {
+		err = -EFAULT;
+		goto err_out_exit;
+	}
+
+	kevent_ring_index_inc(&u->kidx, u->ring_size);
+
+	if (u->kidx == u->uidx)
+		u->full = 1;
+
+	if (put_user(u->kidx, &ring->ring_kidx)) {
+		err = -EFAULT;
+		goto err_out_exit;
+	}
+
+	return 0;
+
+err_out_exit:
+	spin_lock_irqsave(&k->ulock, flags);
+	k->event.ret_flags |= KEVENT_RET_COPY_FAILED;
+	spin_unlock_irqrestore(&k->ulock, flags);
+	return err;
+}
+
+static struct kevent_user *kevent_user_alloc(struct kevent_ring __user *ring, unsigned int num)
+{
+	struct kevent_user *u;
+
+	u = kmem_cache_alloc(kevent_user_cache, GFP_KERNEL);
+	if (!u)
+		return NULL;
+
+	INIT_LIST_HEAD(&u->ready_list);
+	spin_lock_init(&u->ready_lock);
+	kevent_stat_init(u);
+	spin_lock_init(&u->kevent_lock);
+	u->kevent_root = RB_ROOT;
+
+	mutex_init(&u->ctl_mutex);
+	init_waitqueue_head(&u->wait);
+	u->need_exit = 0;
+
+	atomic_set(&u->refcnt, 1);
+
+	mutex_init(&u->ring_lock);
+	u->kidx = u->uidx = u->ring_over = u->full = 0;
+
+	u->pring = ring;
+	u->ring_size = num;
+
+	hrtimer_init(&u->timer, CLOCK_REALTIME, HRTIMER_ABS);
+
+	return u;
+}
+
+/*
+ * Kevent userspace control block reference counting.
+ * Set to 1 at creation time, when appropriate kevent file descriptor
+ * is closed, that reference counter is decreased.
+ * When counter hits zero block is freed.
+ */
+static inline void kevent_user_get(struct kevent_user *u)
+{
+	atomic_inc(&u->refcnt);
+}
+
+static inline void kevent_user_put(struct kevent_user *u)
+{
+	if (atomic_dec_and_test(&u->refcnt)) {
+		kevent_stat_print(u);
+		hrtimer_cancel(&u->timer);
+		kmem_cache_free(kevent_user_cache, u);
+	}
+}
+
+static inline int kevent_compare_id(struct kevent_id *left, struct kevent_id *right)
+{
+	if (left->raw_u64 > right->raw_u64)
+		return -1;
+
+	if (right->raw_u64 > left->raw_u64)
+		return 1;
+
+	return 0;
+}
+
+/*
+ * RCU protects storage list (kevent->storage_entry).
+ * Free entry in RCU callback, it is dequeued from all lists at
+ * this point.
+ */
+
+static void kevent_free_rcu(struct rcu_head *rcu)
+{
+	struct kevent *kevent = container_of(rcu, struct kevent, rcu_head);
+	kmem_cache_free(kevent_cache, kevent);
+}
+
+/*
+ * Must be called under u->ready_lock.
+ * This function unlinks kevent from ready queue.
+ */
+static inline void kevent_unlink_ready(struct kevent *k)
+{
+	list_del(&k->ready_entry);
+	k->flags &= ~KEVENT_READY;
+	k->user->ready_num--;
+}
+
+static void kevent_remove_ready(struct kevent *k)
+{
+	struct kevent_user *u = k->user;
+	unsigned long flags;
+
+	spin_lock_irqsave(&u->ready_lock, flags);
+	if (k->flags & KEVENT_READY)
+		kevent_unlink_ready(k);
+	spin_unlock_irqrestore(&u->ready_lock, flags);
+}
+
+/*
+ * Complete kevent removing - it dequeues kevent from storage list
+ * if it is requested, removes kevent from ready list, drops userspace
+ * control block reference counter and schedules kevent freeing through RCU.
+ */
+static void kevent_finish_user_complete(struct kevent *k, int deq)
+{
+	if (deq)
+		kevent_dequeue(k);
+
+	kevent_remove_ready(k);
+
+	kevent_user_put(k->user);
+	call_rcu(&k->rcu_head, kevent_free_rcu);
+}
+
+/*
+ * Remove from all lists and free kevent.
+ * Must be called under kevent_user->kevent_lock to protect
+ * kevent->kevent_entry removing.
+ */
+static void __kevent_finish_user(struct kevent *k, int deq)
+{
+	struct kevent_user *u = k->user;
+
+	rb_erase(&k->kevent_node, &u->kevent_root);
+	k->flags &= ~KEVENT_USER;
+	u->kevent_num--;
+	kevent_finish_user_complete(k, deq);
+}
+
+/*
+ * Remove kevent from user's list of all events,
+ * dequeue it from storage and decrease user's reference counter,
+ * since this kevent does not exist anymore. That is why it is freed here.
+ */
+static void kevent_finish_user(struct kevent *k, int deq)
+{
+	struct kevent_user *u = k->user;
+	unsigned long flags;
+
+	spin_lock_irqsave(&u->kevent_lock, flags);
+	rb_erase(&k->kevent_node, &u->kevent_root);
+	k->flags &= ~KEVENT_USER;
+	u->kevent_num--;
+	spin_unlock_irqrestore(&u->kevent_lock, flags);
+	kevent_finish_user_complete(k, deq);
+}
+
+static struct kevent *__kevent_dequeue_ready_one(struct kevent_user *u)
+{
+	unsigned long flags;
+	struct kevent *k = NULL;
+
+	if (u->ready_num) {
+		spin_lock_irqsave(&u->ready_lock, flags);
+		if (u->ready_num && !list_empty(&u->ready_list)) {
+			k = list_entry(u->ready_list.next, struct kevent, ready_entry);
+			kevent_unlink_ready(k);
+		}
+		spin_unlock_irqrestore(&u->ready_lock, flags);
+	}
+
+	return k;
+}
+
+static struct kevent *kevent_dequeue_ready_one(struct kevent_user *u)
+{
+	struct kevent *k = NULL;
+
+	while (u->ready_num && !k) {
+		k = __kevent_dequeue_ready_one(u);
+
+		if (k && (k->event.req_flags & KEVENT_REQ_LAST_CHECK)) {
+			unsigned long flags;
+
+			spin_lock_irqsave(&k->ulock, flags);
+			k->event.req_flags &= ~KEVENT_REQ_LAST_CHECK;
+			spin_unlock_irqrestore(&k->ulock, flags);
+
+			if (!k->callbacks.callback(k)) {
+				spin_lock_irqsave(&k->ulock, flags);
+				k->event.req_flags |= KEVENT_REQ_LAST_CHECK;
+				k->event.ret_flags = 0;
+				k->event.ret_data[0] = k->event.ret_data[1] = 0;
+				spin_unlock_irqrestore(&k->ulock, flags);
+				k = NULL;
+			}
+		} else
+			break;
+	}
+
+	return k;
+}
+
+static inline void kevent_copy_ring(struct kevent *k)
+{
+	unsigned long flags;
+
+	if (!k)
+		return;
+
+	if (kevent_copy_ring_buffer(k)) {
+		spin_lock_irqsave(&k->ulock, flags);
+		k->event.ret_flags |= KEVENT_RET_COPY_FAILED;
+		spin_unlock_irqrestore(&k->ulock, flags);
+	}
+}
+
+/*
+ * Dequeue one entry from user's ready queue.
+ */
+static struct kevent *kevent_dequeue_ready(struct kevent_user *u)
+{
+	struct kevent *k;
+
+	mutex_lock(&u->ring_lock);
+	k = kevent_dequeue_ready_one(u);
+	kevent_copy_ring(k);
+	mutex_unlock(&u->ring_lock);
+
+	return k;
+}
+
+/*
+ * Dequeue one entry from user's ready queue if there is space in ring buffer.
+ */
+static struct kevent *kevent_dequeue_ready_ring(struct kevent_user *u)
+{
+	struct kevent *k = NULL;
+
+	mutex_lock(&u->ring_lock);
+	if (kevent_ring_space(u)) {
+		k = kevent_dequeue_ready_one(u);
+		kevent_copy_ring(k);
+	}
+	mutex_unlock(&u->ring_lock);
+
+	return k;
+}
+
+static void kevent_complete_ready(struct kevent *k)
+{
+	if (k->event.req_flags & KEVENT_REQ_ONESHOT)
+		/*
+		 * If it is one-shot kevent, it has been removed already from
+		 * origin's queue, so we can easily free it here.
+		 */
+		kevent_finish_user(k, 1);
+	else if (k->event.req_flags & KEVENT_REQ_ET) {
+		unsigned long flags;
+
+		/*
+		 * Edge-triggered behaviour: mark event as clear new one.
+		 */
+
+		spin_lock_irqsave(&k->ulock, flags);
+		k->event.ret_flags = 0;
+		k->event.ret_data[0] = k->event.ret_data[1] = 0;
+		spin_unlock_irqrestore(&k->ulock, flags);
+	}
+}
+
+/*
+ * Search a kevent inside kevent tree for given ukevent.
+ */
+static struct kevent *__kevent_search(struct kevent_id *id, struct kevent_user *u)
+{
+	struct kevent *k, *ret = NULL;
+	struct rb_node *n = u->kevent_root.rb_node;
+	int cmp;
+
+	while (n) {
+		k = rb_entry(n, struct kevent, kevent_node);
+		cmp = kevent_compare_id(&k->event.id, id);
+
+		if (cmp > 0)
+			n = n->rb_right;
+		else if (cmp < 0)
+			n = n->rb_left;
+		else {
+			ret = k;
+			break;
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * Search and modify kevent according to provided ukevent.
+ */
+static int kevent_modify(struct ukevent *uk, struct kevent_user *u)
+{
+	struct kevent *k;
+	int err = -ENODEV;
+	unsigned long flags;
+
+	spin_lock_irqsave(&u->kevent_lock, flags);
+	k = __kevent_search(&uk->id, u);
+	if (k) {
+		spin_lock(&k->ulock);
+		k->event.event = uk->event;
+		k->event.req_flags = uk->req_flags;
+		k->event.ret_flags = 0;
+		spin_unlock(&k->ulock);
+		kevent_requeue(k);
+		err = 0;
+	}
+	spin_unlock_irqrestore(&u->kevent_lock, flags);
+
+	return err;
+}
+
+/*
+ * Remove kevent which matches provided ukevent.
+ */
+static int kevent_remove(struct ukevent *uk, struct kevent_user *u)
+{
+	int err = -ENODEV;
+	struct kevent *k;
+	unsigned long flags;
+
+	spin_lock_irqsave(&u->kevent_lock, flags);
+	k = __kevent_search(&uk->id, u);
+	if (k) {
+		__kevent_finish_user(k, 1);
+		err = 0;
+	}
+	spin_unlock_irqrestore(&u->kevent_lock, flags);
+
+	return err;
+}
+
+/*
+ * Detaches userspace control block from file descriptor
+ * and decrease it's reference counter.
+ * No new kevents can be added or removed from any list at this point.
+ */
+static int kevent_user_release(struct inode *inode, struct file *file)
+{
+	struct kevent_user *u = file->private_data;
+	struct kevent *k;
+	struct rb_node *n;
+
+	for (n = rb_first(&u->kevent_root); n; n = rb_next(n)) {
+		k = rb_entry(n, struct kevent, kevent_node);
+		kevent_finish_user(k, 1);
+	}
+
+	kevent_user_put(u);
+	file->private_data = NULL;
+
+	return 0;
+}
+
+/*
+ * Read requested number of ukevents in one shot.
+ */
+static struct ukevent *kevent_get_user(unsigned int num, void __user *arg)
+{
+	struct ukevent *ukev;
+
+	ukev = kmalloc(sizeof(struct ukevent) * num, GFP_KERNEL);
+	if (!ukev)
+		return NULL;
+
+	if (copy_from_user(ukev, arg, sizeof(struct ukevent) * num)) {
+		kfree(ukev);
+		return NULL;
+	}
+
+	return ukev;
+}
+
+static int kevent_mark_ready(struct ukevent *uk, struct kevent_user *u)
+{
+	struct kevent *k;
+	int err = -ENODEV;
+	unsigned long flags;
+
+	spin_lock_irqsave(&u->kevent_lock, flags);
+	k = __kevent_search(&uk->id, u);
+	if (k) {
+		spin_lock(&k->st->lock);
+		kevent_ready(k, 1);
+		spin_unlock(&k->st->lock);
+		err = 0;
+	}
+	spin_unlock_irqrestore(&u->kevent_lock, flags);
+
+	return err;
+}
+
+/*
+ * Mark appropriate kevents as ready.
+ * If number of events is zero just wake up one listener.
+ */
+static int kevent_user_ctl_ready(struct kevent_user *u, unsigned int num, void __user *arg)
+{
+	int err = -EINVAL, cerr = 0, rnum = 0, i;
+	void __user *orig = arg;
+	struct ukevent uk;
+
+	if (num > u->kevent_num)
+		return err;
+	
+	if (!num) {
+		u->need_exit = 1;
+		wake_up(&u->wait);
+		return 0;
+	}
+
+	mutex_lock(&u->ctl_mutex);
+
+	if (num > KEVENT_MIN_BUFFS_ALLOC) {
+		struct ukevent *ukev;
+
+		ukev = kevent_get_user(num, arg);
+		if (ukev) {
+			for (i = 0; i < num; ++i) {
+				err = kevent_mark_ready(&ukev[i], u);
+				if (err) {
+					if (i != rnum)
+						memcpy(&ukev[rnum], &ukev[i], sizeof(struct ukevent));
+					rnum++;
+				}
+			}
+			if (copy_to_user(orig, ukev, rnum*sizeof(struct ukevent)))
+				cerr = -EFAULT;
+			kfree(ukev);
+			goto out_setup;
+		}
+	}
+
+	for (i = 0; i < num; ++i) {
+		if (copy_from_user(&uk, arg, sizeof(struct ukevent))) {
+			cerr = -EFAULT;
+			break;
+		}
+		arg += sizeof(struct ukevent);
+
+		err = kevent_mark_ready(&uk, u);
+		if (err) {
+			if (copy_to_user(orig, &uk, sizeof(struct ukevent))) {
+				cerr = -EFAULT;
+				break;
+			}
+			orig += sizeof(struct ukevent);
+			rnum++;
+		}
+	}
+
+out_setup:
+	if (cerr < 0) {
+		err = cerr;
+		goto out_remove;
+	}
+
+	err = num - rnum;
+out_remove:
+	mutex_unlock(&u->ctl_mutex);
+
+	return err;
+}
+
+/*
+ * Read from userspace all ukevents and modify appropriate kevents.
+ * If provided number of ukevents is more that threshold, it is faster
+ * to allocate a room for them and copy in one shot instead of copy
+ * one-by-one and then process them.
+ */
+static int kevent_user_ctl_modify(struct kevent_user *u, unsigned int num, void __user *arg)
+{
+	int err = 0, i;
+	struct ukevent uk;
+
+	mutex_lock(&u->ctl_mutex);
+
+	if (num > u->kevent_num) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	if (num > KEVENT_MIN_BUFFS_ALLOC) {
+		struct ukevent *ukev;
+
+		ukev = kevent_get_user(num, arg);
+		if (ukev) {
+			for (i = 0; i < num; ++i) {
+				if (kevent_modify(&ukev[i], u))
+					ukev[i].ret_flags |= KEVENT_RET_BROKEN;
+				ukev[i].ret_flags |= KEVENT_RET_DONE;
+			}
+			if (copy_to_user(arg, ukev, num*sizeof(struct ukevent)))
+				err = -EFAULT;
+			kfree(ukev);
+			goto out;
+		}
+	}
+
+	for (i = 0; i < num; ++i) {
+		if (copy_from_user(&uk, arg, sizeof(struct ukevent))) {
+			err = -EFAULT;
+			break;
+		}
+
+		if (kevent_modify(&uk, u))
+			uk.ret_flags |= KEVENT_RET_BROKEN;
+		uk.ret_flags |= KEVENT_RET_DONE;
+
+		if (copy_to_user(arg, &uk, sizeof(struct ukevent))) {
+			err = -EFAULT;
+			break;
+		}
+
+		arg += sizeof(struct ukevent);
+	}
+out:
+	mutex_unlock(&u->ctl_mutex);
+
+	return err;
+}
+
+/*
+ * Read from userspace all ukevents and remove appropriate kevents.
+ * If provided number of ukevents is more that threshold, it is faster
+ * to allocate a room for them and copy in one shot instead of copy
+ * one-by-one and then process them.
+ */
+static int kevent_user_ctl_remove(struct kevent_user *u, unsigned int num, void __user *arg)
+{
+	int err = 0, i;
+	struct ukevent uk;
+
+	mutex_lock(&u->ctl_mutex);
+
+	if (num > u->kevent_num) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	if (num > KEVENT_MIN_BUFFS_ALLOC) {
+		struct ukevent *ukev;
+
+		ukev = kevent_get_user(num, arg);
+		if (ukev) {
+			for (i = 0; i < num; ++i) {
+				if (kevent_remove(&ukev[i], u))
+					ukev[i].ret_flags |= KEVENT_RET_BROKEN;
+				ukev[i].ret_flags |= KEVENT_RET_DONE;
+			}
+			if (copy_to_user(arg, ukev, num*sizeof(struct ukevent)))
+				err = -EFAULT;
+			kfree(ukev);
+			goto out;
+		}
+	}
+
+	for (i = 0; i < num; ++i) {
+		if (copy_from_user(&uk, arg, sizeof(struct ukevent))) {
+			err = -EFAULT;
+			break;
+		}
+
+		if (kevent_remove(&uk, u))
+			uk.ret_flags |= KEVENT_RET_BROKEN;
+
+		uk.ret_flags |= KEVENT_RET_DONE;
+
+		if (copy_to_user(arg, &uk, sizeof(struct ukevent))) {
+			err = -EFAULT;
+			break;
+		}
+
+		arg += sizeof(struct ukevent);
+	}
+out:
+	mutex_unlock(&u->ctl_mutex);
+
+	return err;
+}
+
+/*
+ * Queue kevent into userspace control block and increase
+ * it's reference counter.
+ */
+static int kevent_user_enqueue(struct kevent_user *u, struct kevent *new)
+{
+	unsigned long flags;
+	struct rb_node **p = &u->kevent_root.rb_node, *parent = NULL;
+	struct kevent *k;
+	int err = 0, cmp;
+
+	spin_lock_irqsave(&u->kevent_lock, flags);
+	while (*p) {
+		parent = *p;
+		k = rb_entry(parent, struct kevent, kevent_node);
+
+		cmp = kevent_compare_id(&k->event.id, &new->event.id);
+		if (cmp > 0)
+			p = &parent->rb_right;
+		else if (cmp < 0)
+			p = &parent->rb_left;
+		else {
+			err = -EEXIST;
+			break;
+		}
+	}
+	if (likely(!err)) {
+		rb_link_node(&new->kevent_node, parent, p);
+		rb_insert_color(&new->kevent_node, &u->kevent_root);
+		new->flags |= KEVENT_USER;
+		u->kevent_num++;
+		kevent_user_get(u);
+	}
+	spin_unlock_irqrestore(&u->kevent_lock, flags);
+
+	return err;
+}
+
+/*
+ * Add kevent from both kernel and userspace users.
+ * This function allocates and queues kevent, returns negative value
+ * on error, positive if kevent is ready immediately and zero
+ * if kevent has been queued.
+ */
+int kevent_user_add_ukevent(struct ukevent *uk, struct kevent_user *u)
+{
+	struct kevent *k;
+	int err;
+
+	k = kmem_cache_alloc(kevent_cache, GFP_KERNEL);
+	if (!k) {
+		err = -ENOMEM;
+		goto err_out_exit;
+	}
+
+	memcpy(&k->event, uk, sizeof(struct ukevent));
+	INIT_RCU_HEAD(&k->rcu_head);
+
+	k->event.ret_flags = 0;
+
+	err = kevent_init(k);
+	if (err) {
+		kmem_cache_free(kevent_cache, k);
+		goto err_out_exit;
+	}
+	k->user = u;
+	kevent_stat_total(u);
+	err = kevent_user_enqueue(u, k);
+	if (err) {
+		kmem_cache_free(kevent_cache, k);
+		goto err_out_exit;
+	}
+
+	err = kevent_enqueue(k);
+	if (err) {
+		memcpy(uk, &k->event, sizeof(struct ukevent));
+		kevent_finish_user(k, 0);
+		goto err_out_exit;
+	}
+
+	return 0;
+
+err_out_exit:
+	if (err < 0) {
+		uk->ret_flags |= KEVENT_RET_BROKEN | KEVENT_RET_DONE;
+		uk->ret_data[1] = err;
+	} else if (err > 0)
+		uk->ret_flags |= KEVENT_RET_DONE;
+	return err;
+}
+
+
+/*
+ * Copy all ukevents from userspace, allocate kevent for each one
+ * and add them into appropriate kevent_storages,
+ * e.g. sockets, inodes and so on...
+ * Ready events will replace ones provided by used and number
+ * of ready events is returned.
+ * User must check ret_flags field of each ukevent structure
+ * to determine if it is fired or failed event.
+ */
+static int kevent_user_ctl_add(struct kevent_user *u, unsigned int num, void __user *arg)
+{
+	int err, cerr = 0, rnum = 0, i;
+	void __user *orig = arg;
+	struct ukevent uk;
+
+	mutex_lock(&u->ctl_mutex);
+
+	err = -EINVAL;
+	if (num > KEVENT_MIN_BUFFS_ALLOC) {
+		struct ukevent *ukev;
+
+		ukev = kevent_get_user(num, arg);
+		if (ukev) {
+			for (i = 0; i < num; ++i) {
+				if (!kevent_event_is_allowed(&ukev[i])) {
+					if (i != rnum)
+						memcpy(&ukev[rnum], 
+							&ukev[i], 
+							sizeof(struct ukevent));
+					rnum++;
+				} else {
+					err = kevent_user_add_ukevent(&ukev[i], u);
+					if (err) {
+						kevent_stat_im(u);
+						if (i != rnum)
+							memcpy(&ukev[rnum], 
+								&ukev[i], 
+								sizeof(struct ukevent));
+						rnum++;
+					}
+				}
+			}
+			if (copy_to_user(orig, ukev, rnum*sizeof(struct ukevent)))
+				cerr = -EFAULT;
+			kfree(ukev);
+			goto out_setup;
+		}
+	}
+
+	for (i = 0; i < num; ++i) {
+		if (copy_from_user(&uk, arg, sizeof(struct ukevent))) {
+			cerr = -EFAULT;
+			break;
+		}
+		arg += sizeof(struct ukevent);
+
+		if (!kevent_event_is_allowed(&uk)) {
+			err = 1;
+		} else {
+			err = kevent_user_add_ukevent(&uk, u);
+			if (err)
+				kevent_stat_im(u);
+		}
+		if (err) {
+			if (copy_to_user(orig, &uk, sizeof(struct ukevent))) {
+				cerr = -EFAULT;
+				break;
+			}
+			orig += sizeof(struct ukevent);
+			rnum++;
+		}
+	}
+
+out_setup:
+	if (cerr < 0) {
+		err = cerr;
+		goto out_remove;
+	}
+
+	err = rnum;
+out_remove:
+	mutex_unlock(&u->ctl_mutex);
+
+	return err;
+}
+
+/* Used to wakeup waiting syscalls in case high-resolution timer is used. */
+static int kevent_user_wake(struct hrtimer *timer)
+{
+	struct kevent_user *u = container_of(timer, struct kevent_user, timer);
+
+	u->need_exit = 1;
+	wake_up(&u->wait);
+
+	return HRTIMER_NORESTART;
+}
+
+
+/*
+ * In nonblocking mode it returns as many events as possible, but not more than @max_nr.
+ * In blocking mode it waits until timeout or if at least @min_nr events are ready.
+ */
+static int kevent_user_wait(struct file *file, struct kevent_user *u,
+		unsigned int min_nr, unsigned int max_nr, struct timespec timeout,
+		void __user *buf, unsigned int flags)
+{
+	struct kevent *k;
+	int num = 0;
+	long tm = MAX_SCHEDULE_TIMEOUT;
+
+	if (!(file->f_flags & O_NONBLOCK)) {
+		if (!timespec_valid(&timeout))
+			return -EINVAL;
+
+		if (flags & KEVENT_FLAGS_ABSTIME) {
+			hrtimer_cancel(&u->timer);
+			hrtimer_init(&u->timer, CLOCK_REALTIME, HRTIMER_ABS);
+			u->timer.expires = ktime_set(timeout.tv_sec, timeout.tv_nsec);
+			u->timer.function = &kevent_user_wake;
+			hrtimer_start(&u->timer, u->timer.expires, HRTIMER_ABS);
+			if (unlikely(kevent_debug_abstime == 0)) {
+				printk(KERN_INFO "kevent: author was wrong, "
+						"someone uses absolute time in %s, "
+						"please report to remove this warning.\n", __func__);
+				kevent_debug_abstime = 1;
+			}
+		} else {
+			tm = timespec_to_jiffies(&timeout);
+		}
+
+		wait_event_interruptible_timeout(u->wait,
+			((u->ready_num >= 1) && kevent_ring_space(u)) || u->need_exit, tm);
+	}
+	u->need_exit = 0;
+
+	while (num < max_nr && ((k = kevent_dequeue_ready(u)) != NULL)) {
+		if (copy_to_user(buf + num*sizeof(struct ukevent),
+					&k->event, sizeof(struct ukevent))) {
+			if (num == 0)
+				num = -EFAULT;
+			break;
+		}
+		kevent_complete_ready(k);
+		++num;
+		kevent_stat_wait(u);
+	}
+
+	return num;
+}
+
+struct file_operations kevent_user_fops = {
+	.release	= kevent_user_release,
+	.poll		= kevent_user_poll,
+	.owner		= THIS_MODULE,
+};
+
+static int kevent_ctl_process(struct file *file, unsigned int cmd, unsigned int num, void __user *arg)
+{
+	int err;
+	struct kevent_user *u = file->private_data;
+
+	switch (cmd) {
+	case KEVENT_CTL_ADD:
+		err = kevent_user_ctl_add(u, num, arg);
+		break;
+	case KEVENT_CTL_REMOVE:
+		err = kevent_user_ctl_remove(u, num, arg);
+		break;
+	case KEVENT_CTL_MODIFY:
+		err = kevent_user_ctl_modify(u, num, arg);
+		break;
+	case KEVENT_CTL_READY:
+		err = kevent_user_ctl_ready(u, num, arg);
+		break;
+	default:
+		err = -EINVAL;
+		break;
+	}
+
+	return err;
+}
+
+/*
+ * Used to get ready kevents from queue.
+ * @ctl_fd - kevent control descriptor which must be obtained through kevent_ctl(KEVENT_CTL_INIT).
+ * @min_nr - minimum number of ready kevents.
+ * @max_nr - maximum number of ready kevents.
+ * @timeout - time to wait until some events are ready.
+ * @buf - buffer to place ready events.
+ * @flags - various flags (see include/linux/ukevent.h KEVENT_FLAGS_*).
+ */
+asmlinkage long sys_kevent_get_events(int ctl_fd, unsigned int min_nr, unsigned int max_nr,
+		struct timespec timeout, struct ukevent __user *buf, unsigned flags)
+{
+	int err = -EINVAL;
+	struct file *file;
+	struct kevent_user *u;
+
+	file = fget(ctl_fd);
+	if (!file)
+		return -EBADF;
+
+	if (file->f_op != &kevent_user_fops)
+		goto out_fput;
+	u = file->private_data;
+
+	err = kevent_user_wait(file, u, min_nr, max_nr, timeout, buf, flags);
+out_fput:
+	fput(file);
+	return err;
+}
+
+static struct vfsmount *kevent_mnt __read_mostly;
+
+static int kevent_get_sb(struct file_system_type *fs_type, int flags,
+		   const char *dev_name, void *data, struct vfsmount *mnt)
+{
+	return get_sb_pseudo(fs_type, "kevent", NULL, 0xaabbccdd, mnt);
+}
+
+static struct file_system_type kevent_fs_type = {
+	.name		= "keventfs",
+	.get_sb		= kevent_get_sb,
+	.kill_sb	= kill_anon_super,
+};
+
+static int keventfs_delete_dentry(struct dentry *dentry)
+{
+	return 1;
+}
+
+static struct dentry_operations keventfs_dentry_operations = {
+	.d_delete	= keventfs_delete_dentry,
+};
+
+asmlinkage long sys_kevent_init(struct kevent_ring __user *ring, unsigned int num, unsigned int flags)
+{
+	struct qstr this;
+	char name[32];
+	struct dentry *dentry;
+	struct inode *inode;
+	struct file *file;
+	int err = -ENFILE, fd;
+	struct kevent_user *u;
+
+	if ((ring && !num) || (!ring && num) || (num == 1))
+		return -EINVAL;
+
+	file = get_empty_filp();
+	if (!file)
+		goto err_out_exit;
+
+	inode = new_inode(kevent_mnt->mnt_sb);
+	if (!inode)
+		goto err_out_fput;
+
+	inode->i_fop = &kevent_user_fops;
+
+	inode->i_state = I_DIRTY;
+	inode->i_mode = S_IRUSR | S_IWUSR;
+	inode->i_uid = current->fsuid;
+	inode->i_gid = current->fsgid;
+	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+
+	err = get_unused_fd();
+	if (err < 0)
+		goto err_out_iput;
+	fd = err;
+
+	err = -ENOMEM;
+	u = kevent_user_alloc(ring, num);
+	if (!u)
+		goto err_out_put_fd;
+
+	sprintf(name, "[%lu]", inode->i_ino);
+	this.name = name;
+	this.len = strlen(name);
+	this.hash = inode->i_ino;
+	dentry = d_alloc(kevent_mnt->mnt_sb->s_root, &this);
+	if (!dentry)
+		goto err_out_free;
+	dentry->d_op = &keventfs_dentry_operations;
+	d_add(dentry, inode);
+	file->f_vfsmnt = mntget(kevent_mnt);
+	file->f_dentry = dentry;
+	file->f_mapping = inode->i_mapping;
+	file->f_pos = 0;
+	file->f_flags = O_RDONLY;
+	file->f_op = &kevent_user_fops;
+	file->f_mode = FMODE_READ;
+	file->f_version = 0;
+	file->private_data = u;
+
+	fd_install(fd, file);
+
+	return fd;
+
+err_out_free:
+	kmem_cache_free(kevent_user_cache, u);
+err_out_put_fd:
+	put_unused_fd(fd);
+err_out_iput:
+	iput(inode);
+err_out_fput:
+	put_filp(file);
+err_out_exit:
+	return err;
+}
+
+/*
+ * Commits user's index (consumer index).
+ * Must be called under u->ring_lock mutex held.
+ */
+static int __kevent_user_commit(struct kevent_user *u, unsigned int new_uidx, unsigned int over)
+{
+	int err = -EOVERFLOW, comm = 0;
+	struct kevent_ring __user *ring = u->pring;
+
+	if (!ring) {
+		err = 0;
+		goto err_out_exit;
+	}
+
+	if (new_uidx >= u->ring_size) {
+		err = -EINVAL;
+		goto err_out_exit;
+	}
+
+	if ((over != u->ring_over - 1) && (over != u->ring_over))
+		goto err_out_exit;
+
+	if (u->uidx < u->kidx && new_uidx > u->kidx) {
+		err = -EINVAL;
+		goto err_out_exit;
+	}
+
+	if (new_uidx > u->uidx) {
+		if (over != u->ring_over)
+			goto err_out_exit;
+
+		comm = new_uidx - u->uidx;
+		u->uidx = new_uidx;
+		u->full = 0;
+	} else if (new_uidx < u->uidx) {
+		comm = u->ring_size - (u->uidx - new_uidx);
+		u->uidx = new_uidx;
+		u->full = 0;
+		u->ring_over++;
+
+		if (put_user(u->ring_over, &ring->ring_over)) {
+			err = -EFAULT;
+			goto err_out_exit;
+		}
+	}
+
+	return comm;
+
+err_out_exit:
+	return err;
+}
+
+/*
+ * This syscall is used to perform waiting until there is free space in the ring
+ * buffer, in that case some events will be copied there.
+ * Function returns number of actually copied ready events in ring buffer.
+ * After this function is completed userspace ring->ring_kidx will be updated.
+ *
+ * @ctl_fd - kevent file descriptor.
+ * @num - number of kevents to process.
+ * @old_uidx - the last index user is aware of.
+ * @timeout - time to wait until there is free space in kevent queue.
+ * @flags - various flags (see include/linux/ukevent.h KEVENT_FLAGS_*).
+ *
+ * When we need to commit @num events, it means we should just remove first @num
+ * kevents from ready queue and copy them into the buffer. 
+ * Kevents will be copied into ring buffer in order they were placed into ready queue.
+ * One-shot kevents will be removed here, since there is no way they can be reused.
+ * Edge-triggered events will be requeued here for better performance.
+ */
+asmlinkage long sys_kevent_wait(int ctl_fd, unsigned int num, unsigned int old_uidx, 
+		struct timespec timeout, unsigned int flags)
+{
+	int err = -EINVAL, copied = 0;
+	struct file *file;
+	struct kevent_user *u;
+	struct kevent *k;
+	struct kevent_ring __user *ring;
+	long tm = MAX_SCHEDULE_TIMEOUT;
+	unsigned int i;
+
+	file = fget(ctl_fd);
+	if (!file)
+		return -EBADF;
+
+	if (file->f_op != &kevent_user_fops)
+		goto out_fput;
+	u = file->private_data;
+
+	ring = u->pring;
+	if (!ring || num > u->ring_size)
+		goto out_fput;
+#if 0
+	/*
+	 * Allow to immediately update ring index, but it is not supported,
+	 * since syscall() has limited number of arguments which is actually
+	 * a good idea - use kevent_commit() instead.
+	 */
+	if ((u->uidx != new_uidx) && (new_uidx != 0xffffffff)) {
+		mutex_lock(&u->ring_lock);
+		__kevent_user_commit(u, new_uidx, over);
+		mutex_unlock(&u->ring_lock);
+	}
+#endif
+
+	if (!(file->f_flags & O_NONBLOCK)) {
+		if (!timespec_valid(&timeout))
+			goto out_fput;
+
+		if (flags & KEVENT_FLAGS_ABSTIME) {
+			hrtimer_cancel(&u->timer);
+			hrtimer_init(&u->timer, CLOCK_REALTIME, HRTIMER_ABS);
+			u->timer.expires = ktime_set(timeout.tv_sec, timeout.tv_nsec);
+			u->timer.function = &kevent_user_wake;
+			hrtimer_start(&u->timer, u->timer.expires, HRTIMER_ABS);
+			if (unlikely(kevent_debug_abstime == 0)) {
+				printk(KERN_INFO "kevent: author was wrong, "
+						"someone uses absolute time in %s, "
+						"please report to remove this warning.\n", __func__);
+				kevent_debug_abstime = 1;
+			}
+		} else {
+			tm = timespec_to_jiffies(&timeout);
+		}
+
+		wait_event_interruptible_timeout(u->wait,
+			((u->ready_num >= 1) && kevent_ring_space(u)) || 
+				u->need_exit || old_uidx != u->uidx,
+				tm);
+	}
+	u->need_exit = 0;
+
+	for (i=0; i<num; ++i) {
+		k = kevent_dequeue_ready_ring(u);
+		if (!k)
+			break;
+		kevent_complete_ready(k);
+
+		if (k->event.ret_flags & KEVENT_RET_COPY_FAILED)
+			break;
+		kevent_stat_ring(u);
+		copied++;
+	}
+
+	fput(file);
+
+	return copied;
+out_fput:
+	fput(file);
+	return err;
+}
+
+/*
+ * This syscall is used to commit events in ring buffer, i.e. mark appropriate
+ * entries as unused by userspace so subsequent kevent_wait() could overwrite them.
+ * This fucntion returns actual number of kevents which were committed.
+ * After this function is completed userspace ring->ring_over can be updated.
+ *
+ * @ctl_fd - kevent file descriptor.
+ * @new_uidx - the last committed kevent.
+ * @over - number of overflows given queue had.
+ */
+asmlinkage long sys_kevent_commit(int ctl_fd, unsigned int new_uidx, unsigned int over)
+{
+	int err = -EINVAL, comm = 0;
+	struct file *file;
+	struct kevent_user *u;
+
+	file = fget(ctl_fd);
+	if (!file)
+		return -EBADF;
+
+	if (file->f_op != &kevent_user_fops)
+		goto out_fput;
+	u = file->private_data;
+
+	mutex_lock(&u->ring_lock);
+	err = __kevent_user_commit(u, new_uidx, over);
+	if (err < 0)
+		goto err_out_unlock;
+	comm = err;
+	mutex_unlock(&u->ring_lock);
+
+	fput(file);
+
+	return comm;
+
+err_out_unlock:
+	mutex_unlock(&u->ring_lock);
+out_fput:
+	fput(file);
+	return err;
+}
+
+/*
+ * This syscall is used to perform various control operations
+ * on given kevent queue, which is obtained through kevent file descriptor @fd.
+ * @cmd - type of operation.
+ * @num - number of kevents to be processed.
+ * @arg - pointer to array of struct ukevent.
+ */
+asmlinkage long sys_kevent_ctl(int fd, unsigned int cmd, unsigned int num, struct ukevent __user *arg)
+{
+	int err = -EINVAL;
+	struct file *file;
+
+	file = fget(fd);
+	if (!file)
+		return -EBADF;
+
+	if (file->f_op != &kevent_user_fops)
+		goto out_fput;
+
+	err = kevent_ctl_process(file, cmd, num, arg);
+
+out_fput:
+	fput(file);
+	return err;
+}
+
+/*
+ * Kevent subsystem initialization - create caches and register
+ * filesystem to get control file descriptors from.
+ */
+static int __init kevent_user_init(void)
+{
+	int err = 0;
+
+	kevent_cache = kmem_cache_create("kevent_cache",
+			sizeof(struct kevent), 0, SLAB_PANIC, NULL, NULL);
+	
+	kevent_user_cache = kmem_cache_create("kevent_user_cache",
+			sizeof(struct kevent_user), 0, SLAB_PANIC, NULL, NULL);
+	
+	err = register_filesystem(&kevent_fs_type);
+	if (err)
+		goto err_out_exit;
+	
+	kevent_mnt = kern_mount(&kevent_fs_type);
+	err = PTR_ERR(kevent_mnt);
+	if (IS_ERR(kevent_mnt))
+		goto err_out_unreg;
+
+	printk(KERN_INFO "KEVENT subsystem has been successfully registered.\n");
+
+	return 0;
+
+err_out_unreg:
+	unregister_filesystem(&kevent_fs_type);
+err_out_exit:
+	kmem_cache_destroy(kevent_cache);
+	return err;
+}
+
+module_init(kevent_user_init);
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 7a3b2e7..3b7d35f 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -122,6 +122,12 @@ cond_syscall(ppc_rtas);
 cond_syscall(sys_spu_run);
 cond_syscall(sys_spu_create);
 
+cond_syscall(sys_kevent_get_events);
+cond_syscall(sys_kevent_ctl);
+cond_syscall(sys_kevent_wait);
+cond_syscall(sys_kevent_commit);
+cond_syscall(sys_kevent_init);
+
 /* mmu depending weak syscall entries */
 cond_syscall(sys_mprotect);
 cond_syscall(sys_msync);


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [take28-resend_2->0 0/8] kevent: Generic event handling mechanism.
       [not found] <3154985aa0591036@2ka.mipt.ru>
@ 2006-12-17 13:53 ` Evgeniy Polyakov
  2006-12-17 13:53   ` [take28-resend_2->0 1/8] kevent: Description Evgeniy Polyakov
  2006-12-19  3:47   ` [take28-resend_2->0 0/8] kevent: Generic event handling mechanism Ulrich Drepper
  2006-12-21  9:14 ` [take28-resend_1->0 " Evgeniy Polyakov
                   ` (3 subsequent siblings)
  4 siblings, 2 replies; 76+ messages in thread
From: Evgeniy Polyakov @ 2006-12-17 13:53 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: David Miller, Ulrich Drepper, Andrew Morton, Evgeniy Polyakov,
	netdev, Zach Brown, Christoph Hellwig, Chase Venters,
	Johann Borck, linux-kernel, Jeff Garzik


Generic event handling mechanism.

Kevent is a generic subsytem which allows to handle event notifications.
It supports both level and edge triggered events. It is similar to
poll/epoll in some cases, but it is more scalable, it is faster and
allows to work with essentially eny kind of events.

Events are provided into kernel through control syscall and can be read
back through ring buffer or using usual syscalls.
Kevent update (i.e. readiness switching) happens directly from internals
of the appropriate state machine of the underlying subsytem (like
network, filesystem, timer or any other).

Homepage:
http://tservice.net.ru/~s0mbre/old/?section=projects&item=kevent

Documentation page:
http://linux-net.osdl.org/index.php/Kevent

Consider for inclusion.

New benchmark, which can be a hoax though, can be found at 
http://tservice.net.ru/~s0mbre/blog/2006/11/30#2006_11_30
where kevent on amd64 with 1gb of ram can handle more than 7200 events per 
second with 8000 requests concurrency with 'ab' benchmark and lighttpd.
Although I tought it should not be published due to possible errors,
I decided to send it for review.

With this release I start 3 days resending timeout - i.e. each third day I 
will send either new version (if something new was requested and agreed to 
be implemented) or resending with back counter started from three. 
When back counter hits zero after three resending I consider there is no 
interest in subsystem and I will stop further sending.

Thanks for understanding and your time.

Changes from 'take27' patchset:
 * made kevent default yes in non embedded case.
 * added falgs to callback structures - currently used to check if kevent
 	can be requested from kernelspace only (posix timers) or 
	userspace (all others)

Changes from 'take26' patchset:
 * made kevent visible in config only in case of embedded setup.
 * added comment about KEVENT_MAX number.
 * spell fix.

Changes from 'take25' patchset:
 * use timespec as timeout parameter.
 * added high-resolution timer to handle absolute timeouts.
 * added flags to waiting and initialization syscalls.
 * kevent_commit() has new_uidx parameter.
 * kevent_wait() has old_uidx parameter, which, if not equal to u->uidx,
 	results in immediate wakeup (usefull for the case when entries
	are added asynchronously from kernel (not supported for now)).
 * added interface to mark any event as ready.
 * event POSIX timers support.
 * return -ENOSYS if there is no registered event type.
 * provided file descriptor must be checked for fifo type (spotted by Eric Dumazet).
 * signal notifications.
 * documentation update.
 * lighttpd patch updated (the latest benchmarks with lighttpd patch can be found in blog).

Changes from 'take24' patchset:
 * new (old (new)) ring buffer implementation with kernel and user indexes.
 * added initialization syscall instead of opening /dev/kevent
 * kevent_commit() syscall to commit ring buffer entries
 * changed KEVENT_REQ_WAKEUP_ONE flag to KEVENT_REQ_WAKEUP_ALL, kevent wakes
   only first thread always if that flag is not set
 * KEVENT_REQ_ALWAYS_QUEUE flag. If set, kevent will be queued into ready queue
   instead of copying back to userspace when kevent is ready immediately when
   it is added.
 * lighttpd patch (Hail! Although nothing really outstanding compared to epoll)

Changes from 'take23' patchset:
 * kevent PIPE notifications
 * KEVENT_REQ_LAST_CHECK flag, which allows to perform last check at dequeueing time
 * fixed poll/select notifications (were broken due to tree manipulations)
 * made Documentation/kevent.txt look nice in 80-col terminal
 * fix for copy_to_user() failure report for the first kevent (Andrew Morton)
 * minor function renames

Changes from 'take22' patchset:
 * new ring buffer implementation in process' memory
 * wakeup-one-thread flag
 * edge-triggered behaviour

Changes from 'take21' patchset:
 * minor cleanups (different return values, removed unneded variables, whitespaces and so on)
 * fixed bug in kevent removal in case when kevent being removed
   is the same as overflow_kevent (spotted by Eric Dumazet)

Changes from 'take20' patchset:
 * new ring buffer implementation
 * removed artificial limit on possible number of kevents

Changes from 'take19' patchset:
 * use __init instead of __devinit
 * removed 'default N' from config for user statistic
 * removed kevent_user_fini() since kevent can not be unloaded
 * use KERN_INFO for statistic output

Changes from 'take18' patchset:
 * use __init instead of __devinit
 * removed 'default N' from config for user statistic
 * removed kevent_user_fini() since kevent can not be unloaded
 * use KERN_INFO for statistic output

Changes from 'take17' patchset:
 * Use RB tree instead of hash table. 
	At least for a web sever, frequency of addition/deletion of new kevent 
	is comparable with number of search access, i.e. most of the time events 
	are added, accesed only couple of times and then removed, so it justifies 
	RB tree usage over AVL tree, since the latter does have much slower deletion 
	time (max O(log(N)) compared to 3 ops), 
	although faster search time (1.44*O(log(N)) vs. 2*O(log(N))). 
	So for kevents I use RB tree for now and later, when my AVL tree implementation 
	is ready, it will be possible to compare them.
 * Changed readiness check for socket notifications.

With both above changes it is possible to achieve more than 3380 req/second compared to 2200, 
sometimes 2500 req/second for epoll() for trivial web-server and httperf client on the same
hardware.
It is possible that above kevent limit is due to maximum allowed kevents in a time limit, which is
4096 events.

Changes from 'take16' patchset:
 * misc cleanups (__read_mostly, const ...)
 * created special macro which is used for mmap size (number of pages) calculation
 * export kevent_socket_notify(), since it is used in network protocols which can be 
	built as modules (IPv6 for example)

Changes from 'take15' patchset:
 * converted kevent_timer to high-resolution timers, this forces timer API update at
	http://linux-net.osdl.org/index.php/Kevent
 * use struct ukevent* instead of void * in syscalls (documentation has been updated)
 * added warning in kevent_add_ukevent() if ring has broken index (for testing)

Changes from 'take14' patchset:
 * added kevent_wait()
    This syscall waits until either timeout expires or at least one event
    becomes ready. It also commits that @num events from @start are processed
    by userspace and thus can be be removed or rearmed (depending on it's flags).
    It can be used for commit events read by userspace through mmap interface.
    Example userspace code (evtest.c) can be found on project's homepage.
 * added socket notifications (send/recv/accept)

Changes from 'take13' patchset:
 * do not get lock aroung user data check in __kevent_search()
 * fail early if there were no registered callbacks for given type of kevent
 * trailing whitespace cleanup

Changes from 'take12' patchset:
 * remove non-chardev interface for initialization
 * use pointer to kevent_mring instead of unsigned longs
 * use aligned 64bit type in raw user data (can be used by high-res timer if needed)
 * simplified enqueue/dequeue callbacks and kevent initialization
 * use nanoseconds for timeout
 * put number of milliseconds into timer's return data
 * move some definitions into user-visible header
 * removed filenames from comments

Changes from 'take11' patchset:
 * include missing headers into patchset
 * some trivial code cleanups (use goto instead of if/else games and so on)
 * some whitespace cleanups
 * check for ready_callback() callback before main loop which should save us some ticks

Changes from 'take10' patchset:
 * removed non-existent prototypes
 * added helper function for kevent_registered_callbacks
 * fixed 80 lines comments issues
 * added shared between userspace and kernelspace header instead of embedd them in one
 * core restructuring to remove forward declarations
 * s o m e w h i t e s p a c e c o d y n g s t y l e c l e a n u p
 * use vm_insert_page() instead of remap_pfn_range()

Changes from 'take9' patchset:
 * fixed ->nopage method

Changes from 'take8' patchset:
 * fixed mmap release bug
 * use module_init() instead of late_initcall()
 * use better structures for timer notifications

Changes from 'take7' patchset:
 * new mmap interface (not tested, waiting for other changes to be acked)
	- use nopage() method to dynamically substitue pages
	- allocate new page for events only when new added kevent requres it
	- do not use ugly index dereferencing, use structure instead
	- reduced amount of data in the ring (id and flags), 
		maximum 12 pages on x86 per kevent fd

Changes from 'take6' patchset:
 * a lot of comments!
 * do not use list poisoning for detection of the fact, that entry is in the list
 * return number of ready kevents even if copy*user() fails
 * strict check for number of kevents in syscall
 * use ARRAY_SIZE for array size calculation
 * changed superblock magic number
 * use SLAB_PANIC instead of direct panic() call
 * changed -E* return values
 * a lot of small cleanups and indent fixes

Changes from 'take5' patchset:
 * removed compilation warnings about unused wariables when lockdep is not turned on
 * do not use internal socket structures, use appropriate (exported) wrappers instead
 * removed default 1 second timeout
 * removed AIO stuff from patchset

Changes from 'take4' patchset:
 * use miscdevice instead of chardevice
 * comments fixes

Changes from 'take3' patchset:
 * removed serializing mutex from kevent_user_wait()
 * moved storage list processing to RCU
 * removed lockdep screaming - all storage locks are initialized in the same function, so it was
learned 
	to differentiate between various cases
 * remove kevent from storage if is marked as broken after callback
 * fixed a typo in mmaped buffer implementation which would end up in wrong index calcualtion 

Changes from 'take2' patchset:
 * split kevent_finish_user() to locked and unlocked variants
 * do not use KEVENT_STAT ifdefs, use inline functions instead
 * use array of callbacks of each type instead of each kevent callback initialization
 * changed name of ukevent guarding lock
 * use only one kevent lock in kevent_user for all hash buckets instead of per-bucket locks
 * do not use kevent_user_ctl structure instead provide needed arguments as syscall parameters
 * various indent cleanups
 * added optimisation, which is aimed to help when a lot of kevents are being copied from
userspace
 * mapped buffer (initial) implementation (no userspace yet)

Changes from 'take1' patchset:
 - rebased against 2.6.18-git tree
 - removed ioctl controlling
 - added new syscall kevent_get_events(int fd, unsigned int min_nr, unsigned int max_nr,
			unsigned int timeout, void __user *buf, unsigned flags)
 - use old syscall kevent_ctl for creation/removing, modification and initial kevent 
	initialization
 - use mutuxes instead of semaphores
 - added file descriptor check and return error if provided descriptor does not match
	kevent file operations
 - various indent fixes
 - removed aio_sendfile() declarations.

Thank you.

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>



^ permalink raw reply	[flat|nested] 76+ messages in thread

* [take28-resend_2->0 7/8] kevent: Signal notifications.
  2006-12-17 13:53             ` [take28-resend_2->0 6/8] kevent: Pipe notifications Evgeniy Polyakov
@ 2006-12-17 13:53               ` Evgeniy Polyakov
  2006-12-17 13:53                 ` [take28-resend_2->0 8/8] kevent: Kevent posix timer notifications Evgeniy Polyakov
  0 siblings, 1 reply; 76+ messages in thread
From: Evgeniy Polyakov @ 2006-12-17 13:53 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: David Miller, Ulrich Drepper, Andrew Morton, Evgeniy Polyakov,
	netdev, Zach Brown, Christoph Hellwig, Chase Venters,
	Johann Borck, linux-kernel, Jeff Garzik


Signal notifications.

This type of notifications allows to deliver signals through kevent queue.
One can find example application signal.c on project homepage.

If KEVENT_SIGNAL_NOMASK bit is set in raw_u64 id then signal will be
delivered only through queue, otherwise both delivery types are used - old
through update of mask of pending signals and through queue.

If signal is delivered only through kevent queue mask of pending signals
is not updated at all, which is equal to putting signal into blocked mask,
but with delivery of that signal through kevent queue.

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>


diff --git a/include/linux/sched.h b/include/linux/sched.h
index fc4a987..ef38a3c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -80,6 +80,7 @@ struct sched_param {
 #include <linux/resource.h>
 #include <linux/timer.h>
 #include <linux/hrtimer.h>
+#include <linux/kevent_storage.h>
 
 #include <asm/processor.h>
 
@@ -1013,6 +1014,10 @@ struct task_struct {
 #ifdef	CONFIG_TASK_DELAY_ACCT
 	struct task_delay_info *delays;
 #endif
+#ifdef CONFIG_KEVENT_SIGNAL
+	struct kevent_storage st;
+	u32 kevent_signals;
+#endif
 };
 
 static inline pid_t process_group(struct task_struct *tsk)
diff --git a/kernel/fork.c b/kernel/fork.c
index 1c999f3..e5b5b14 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -46,6 +46,7 @@
 #include <linux/delayacct.h>
 #include <linux/taskstats_kern.h>
 #include <linux/random.h>
+#include <linux/kevent.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -115,6 +116,9 @@ void __put_task_struct(struct task_struct *tsk)
 	WARN_ON(atomic_read(&tsk->usage));
 	WARN_ON(tsk == current);
 
+#ifdef CONFIG_KEVENT_SIGNAL
+	kevent_storage_fini(&tsk->st);
+#endif
 	security_task_free(tsk);
 	free_uid(tsk->user);
 	put_group_info(tsk->group_info);
@@ -1121,6 +1125,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	if (retval)
 		goto bad_fork_cleanup_namespace;
 
+#ifdef CONFIG_KEVENT_SIGNAL
+	kevent_storage_init(p, &p->st);
+#endif
+
 	p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
 	/*
 	 * Clear TID on mm_release()?
diff --git a/kernel/kevent/kevent_signal.c b/kernel/kevent/kevent_signal.c
new file mode 100644
index 0000000..abe3972
--- /dev/null
+++ b/kernel/kevent/kevent_signal.c
@@ -0,0 +1,94 @@
+/*
+ * 	kevent_signal.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/kevent.h>
+
+static int kevent_signal_callback(struct kevent *k)
+{
+	struct task_struct *tsk = k->st->origin;
+	int sig = k->event.id.raw[0];
+	int ret = 0;
+
+	if (sig == tsk->kevent_signals)
+		ret = 1;
+
+	if (ret && (k->event.id.raw_u64 & KEVENT_SIGNAL_NOMASK))
+		tsk->kevent_signals |= 0x80000000;
+
+	return ret;
+}
+
+int kevent_signal_enqueue(struct kevent *k)
+{
+	int err;
+
+	err = kevent_storage_enqueue(&current->st, k);
+	if (err)
+		goto err_out_exit;
+
+	if (k->event.req_flags & KEVENT_REQ_ALWAYS_QUEUE) {
+		kevent_requeue(k);
+		err = 0;
+	} else {
+		err = k->callbacks.callback(k);
+		if (err)
+			goto err_out_dequeue;
+	}
+
+	return err;
+
+err_out_dequeue:
+	kevent_storage_dequeue(k->st, k);
+err_out_exit:
+	return err;
+}
+
+int kevent_signal_dequeue(struct kevent *k)
+{
+	kevent_storage_dequeue(k->st, k);
+	return 0;
+}
+
+int kevent_signal_notify(struct task_struct *tsk, int sig)
+{
+	tsk->kevent_signals = sig;
+	kevent_storage_ready(&tsk->st, NULL, KEVENT_SIGNAL_DELIVERY);
+	return (tsk->kevent_signals & 0x80000000);
+}
+
+static int __init kevent_init_signal(void)
+{
+	struct kevent_callbacks sc = {
+		.callback = &kevent_signal_callback,
+		.enqueue = &kevent_signal_enqueue,
+		.dequeue = &kevent_signal_dequeue,
+		.flags = 0,
+	};
+
+	return kevent_add_callbacks(&sc, KEVENT_SIGNAL);
+}
+module_init(kevent_init_signal);
diff --git a/kernel/signal.c b/kernel/signal.c
index fb5da6d..d3d3594 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -23,6 +23,7 @@
 #include <linux/ptrace.h>
 #include <linux/signal.h>
 #include <linux/capability.h>
+#include <linux/kevent.h>
 #include <asm/param.h>
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -703,6 +704,9 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
 {
 	struct sigqueue * q = NULL;
 	int ret = 0;
+	
+	if (kevent_signal_notify(t, sig))
+		return 1;
 
 	/*
 	 * fast-pathed signals for kernel-internal things like SIGSTOP
@@ -782,6 +786,17 @@ specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t)
 	ret = send_signal(sig, info, t, &t->pending);
 	if (!ret && !sigismember(&t->blocked, sig))
 		signal_wake_up(t, sig == SIGKILL);
+#ifdef CONFIG_KEVENT_SIGNAL
+	/*
+	 * Kevent allows to deliver signals through kevent queue, 
+	 * it is possible to setup kevent to not deliver
+	 * signal through the usual way, in that case send_signal()
+	 * returns 1 and signal is delivered only through kevent queue.
+	 * We simulate successfull delivery notification through this hack:
+	 */
+	if (ret == 1)
+		ret = 0;
+#endif
 out:
 	return ret;
 }
@@ -971,6 +986,17 @@ __group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
 	 * to avoid several races.
 	 */
 	ret = send_signal(sig, info, p, &p->signal->shared_pending);
+#ifdef CONFIG_KEVENT_SIGNAL
+	/*
+	 * Kevent allows to deliver signals through kevent queue, 
+	 * it is possible to setup kevent to not deliver
+	 * signal through the usual way, in that case send_signal()
+	 * returns 1 and signal is delivered only through kevent queue.
+	 * We simulate successfull delivery notification through this hack:
+	 */
+	if (ret == 1)
+		ret = 0;
+#endif
 	if (unlikely(ret))
 		return ret;
 


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [take28-resend_2->0 6/8] kevent: Pipe notifications.
  2006-12-17 13:53           ` [take28-resend_2->0 5/8] kevent: Timer notifications Evgeniy Polyakov
@ 2006-12-17 13:53             ` Evgeniy Polyakov
  2006-12-17 13:53               ` [take28-resend_2->0 7/8] kevent: Signal notifications Evgeniy Polyakov
  0 siblings, 1 reply; 76+ messages in thread
From: Evgeniy Polyakov @ 2006-12-17 13:53 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: David Miller, Ulrich Drepper, Andrew Morton, Evgeniy Polyakov,
	netdev, Zach Brown, Christoph Hellwig, Chase Venters,
	Johann Borck, linux-kernel, Jeff Garzik


Pipe notifications.


diff --git a/fs/pipe.c b/fs/pipe.c
index f3b6f71..aeaee9c 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -16,6 +16,7 @@
 #include <linux/uio.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
+#include <linux/kevent.h>
 
 #include <asm/uaccess.h>
 #include <asm/ioctls.h>
@@ -312,6 +313,7 @@ redo:
 			break;
 		}
 		if (do_wakeup) {
+			kevent_pipe_notify(inode, KEVENT_SOCKET_SEND);
 			wake_up_interruptible_sync(&pipe->wait);
  			kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
 		}
@@ -321,6 +323,7 @@ redo:
 
 	/* Signal writers asynchronously that there is more room. */
 	if (do_wakeup) {
+		kevent_pipe_notify(inode, KEVENT_SOCKET_SEND);
 		wake_up_interruptible(&pipe->wait);
 		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
 	}
@@ -490,6 +493,7 @@ redo2:
 			break;
 		}
 		if (do_wakeup) {
+			kevent_pipe_notify(inode, KEVENT_SOCKET_RECV);
 			wake_up_interruptible_sync(&pipe->wait);
 			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 			do_wakeup = 0;
@@ -501,6 +505,7 @@ redo2:
 out:
 	mutex_unlock(&inode->i_mutex);
 	if (do_wakeup) {
+		kevent_pipe_notify(inode, KEVENT_SOCKET_RECV);
 		wake_up_interruptible(&pipe->wait);
 		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 	}
@@ -605,6 +610,7 @@ pipe_release(struct inode *inode, int decr, int decw)
 		free_pipe_info(inode);
 	} else {
 		wake_up_interruptible(&pipe->wait);
+		kevent_pipe_notify(inode, KEVENT_SOCKET_SEND|KEVENT_SOCKET_RECV);
 		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
 	}
diff --git a/kernel/kevent/kevent_pipe.c b/kernel/kevent/kevent_pipe.c
new file mode 100644
index 0000000..91dc1eb
--- /dev/null
+++ b/kernel/kevent/kevent_pipe.c
@@ -0,0 +1,123 @@
+/*
+ * 	kevent_pipe.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/kevent.h>
+#include <linux/pipe_fs_i.h>
+
+static int kevent_pipe_callback(struct kevent *k)
+{
+	struct inode *inode = k->st->origin;
+	struct pipe_inode_info *pipe = inode->i_pipe;
+	int nrbufs = pipe->nrbufs;
+
+	if (k->event.event & KEVENT_SOCKET_RECV && nrbufs > 0) {
+		if (!pipe->writers)
+			return -1;
+		return 1;
+	}
+	
+	if (k->event.event & KEVENT_SOCKET_SEND && nrbufs < PIPE_BUFFERS) {
+		if (!pipe->readers)
+			return -1;
+		return 1;
+	}
+
+	return 0;
+}
+
+int kevent_pipe_enqueue(struct kevent *k)
+{
+	struct file *pipe;
+	int err = -EBADF;
+	struct inode *inode;
+
+	pipe = fget(k->event.id.raw[0]);
+	if (!pipe)
+		goto err_out_exit;
+
+	inode = igrab(pipe->f_dentry->d_inode);
+	if (!inode)
+		goto err_out_fput;
+
+	err = -EINVAL;
+	if (!S_ISFIFO(inode->i_mode))
+		goto err_out_iput;
+
+	err = kevent_storage_enqueue(&inode->st, k);
+	if (err)
+		goto err_out_iput;
+
+	if (k->event.req_flags & KEVENT_REQ_ALWAYS_QUEUE) {
+		kevent_requeue(k);
+		err = 0;
+	} else {
+		err = k->callbacks.callback(k);
+		if (err)
+			goto err_out_dequeue;
+	}
+
+	fput(pipe);
+
+	return err;
+
+err_out_dequeue:
+	kevent_storage_dequeue(k->st, k);
+err_out_iput:
+	iput(inode);
+err_out_fput:
+	fput(pipe);
+err_out_exit:
+	return err;
+}
+
+int kevent_pipe_dequeue(struct kevent *k)
+{
+	struct inode *inode = k->st->origin;
+
+	kevent_storage_dequeue(k->st, k);
+	iput(inode);
+
+	return 0;
+}
+
+void kevent_pipe_notify(struct inode *inode, u32 event)
+{
+	kevent_storage_ready(&inode->st, NULL, event);
+}
+
+static int __init kevent_init_pipe(void)
+{
+	struct kevent_callbacks sc = {
+		.callback = &kevent_pipe_callback,
+		.enqueue = &kevent_pipe_enqueue,
+		.dequeue = &kevent_pipe_dequeue,
+		.flags = 0,
+	};
+
+	return kevent_add_callbacks(&sc, KEVENT_PIPE);
+}
+module_init(kevent_init_pipe);


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [take28-resend_2->0 5/8] kevent: Timer notifications.
  2006-12-17 13:53         ` [take28-resend_2->0 4/8] kevent: Socket notifications Evgeniy Polyakov
@ 2006-12-17 13:53           ` Evgeniy Polyakov
  2006-12-17 13:53             ` [take28-resend_2->0 6/8] kevent: Pipe notifications Evgeniy Polyakov
  0 siblings, 1 reply; 76+ messages in thread
From: Evgeniy Polyakov @ 2006-12-17 13:53 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: David Miller, Ulrich Drepper, Andrew Morton, Evgeniy Polyakov,
	netdev, Zach Brown, Christoph Hellwig, Chase Venters,
	Johann Borck, linux-kernel, Jeff Garzik


Timer notifications.

Timer notifications can be used for fine grained per-process time 
management, since interval timers are very inconvenient to use, 
and they are limited.

This subsystem uses high-resolution timers.
id.raw[0] is used as number of seconds
id.raw[1] is used as number of nanoseconds

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>

diff --git a/kernel/kevent/kevent_timer.c b/kernel/kevent/kevent_timer.c
new file mode 100644
index 0000000..c21a155
--- /dev/null
+++ b/kernel/kevent/kevent_timer.c
@@ -0,0 +1,114 @@
+/*
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/hrtimer.h>
+#include <linux/jiffies.h>
+#include <linux/kevent.h>
+
+struct kevent_timer
+{
+	struct hrtimer		ktimer;
+	struct kevent_storage	ktimer_storage;
+	struct kevent		*ktimer_event;
+};
+
+static int kevent_timer_func(struct hrtimer *timer)
+{
+	struct kevent_timer *t = container_of(timer, struct kevent_timer, ktimer);
+	struct kevent *k = t->ktimer_event;
+
+	kevent_storage_ready(&t->ktimer_storage, NULL, KEVENT_MASK_ALL);
+	hrtimer_forward(timer, timer->base->softirq_time,
+			ktime_set(k->event.id.raw[0], k->event.id.raw[1]));
+	return HRTIMER_RESTART;
+}
+
+static struct lock_class_key kevent_timer_key;
+
+static int kevent_timer_enqueue(struct kevent *k)
+{
+	int err;
+	struct kevent_timer *t;
+
+	t = kmalloc(sizeof(struct kevent_timer), GFP_KERNEL);
+	if (!t)
+		return -ENOMEM;
+
+	hrtimer_init(&t->ktimer, CLOCK_MONOTONIC, HRTIMER_REL);
+	t->ktimer.expires = ktime_set(k->event.id.raw[0], k->event.id.raw[1]);
+	t->ktimer.function = kevent_timer_func;
+	t->ktimer_event = k;
+
+	err = kevent_storage_init(&t->ktimer, &t->ktimer_storage);
+	if (err)
+		goto err_out_free;
+	lockdep_set_class(&t->ktimer_storage.lock, &kevent_timer_key);
+
+	err = kevent_storage_enqueue(&t->ktimer_storage, k);
+	if (err)
+		goto err_out_st_fini;
+
+	hrtimer_start(&t->ktimer, t->ktimer.expires, HRTIMER_REL);
+
+	return 0;
+
+err_out_st_fini:
+	kevent_storage_fini(&t->ktimer_storage);
+err_out_free:
+	kfree(t);
+
+	return err;
+}
+
+static int kevent_timer_dequeue(struct kevent *k)
+{
+	struct kevent_storage *st = k->st;
+	struct kevent_timer *t = container_of(st, struct kevent_timer, ktimer_storage);
+
+	hrtimer_cancel(&t->ktimer);
+	kevent_storage_dequeue(st, k);
+	kfree(t);
+
+	return 0;
+}
+
+static int kevent_timer_callback(struct kevent *k)
+{
+	k->event.ret_data[0] = jiffies_to_msecs(jiffies);
+	return 1;
+}
+
+static int __init kevent_init_timer(void)
+{
+	struct kevent_callbacks tc = {
+		.callback = &kevent_timer_callback,
+		.enqueue = &kevent_timer_enqueue,
+		.dequeue = &kevent_timer_dequeue,
+		.flags = 0,
+	};
+
+	return kevent_add_callbacks(&tc, KEVENT_TIMER);
+}
+module_init(kevent_init_timer);
+


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [take28-resend_2->0 8/8] kevent: Kevent posix timer notifications.
  2006-12-17 13:53               ` [take28-resend_2->0 7/8] kevent: Signal notifications Evgeniy Polyakov
@ 2006-12-17 13:53                 ` Evgeniy Polyakov
  0 siblings, 0 replies; 76+ messages in thread
From: Evgeniy Polyakov @ 2006-12-17 13:53 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: David Miller, Ulrich Drepper, Andrew Morton, Evgeniy Polyakov,
	netdev, Zach Brown, Christoph Hellwig, Chase Venters,
	Johann Borck, linux-kernel, Jeff Garzik


Kevent posix timer notifications.

Simple extensions to POSIX timers which allows
to deliver notification of the timer expiration
through kevent queue.

Example application posix_timer.c can be found
in archive on project homepage.

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>


diff --git a/include/asm-generic/siginfo.h b/include/asm-generic/siginfo.h
index 8786e01..3768746 100644
--- a/include/asm-generic/siginfo.h
+++ b/include/asm-generic/siginfo.h
@@ -235,6 +235,7 @@ typedef struct siginfo {
 #define SIGEV_NONE	1	/* other notification: meaningless */
 #define SIGEV_THREAD	2	/* deliver via thread creation */
 #define SIGEV_THREAD_ID 4	/* deliver to thread */
+#define SIGEV_KEVENT	8	/* deliver through kevent queue */
 
 /*
  * This works because the alignment is ok on all current architectures
@@ -260,6 +261,8 @@ typedef struct sigevent {
 			void (*_function)(sigval_t);
 			void *_attribute;	/* really pthread_attr_t */
 		} _sigev_thread;
+
+		int kevent_fd;
 	} _sigev_un;
 } sigevent_t;
 
diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index a7dd38f..4b9deb4 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -4,6 +4,7 @@
 #include <linux/spinlock.h>
 #include <linux/list.h>
 #include <linux/sched.h>
+#include <linux/kevent_storage.h>
 
 union cpu_time_count {
 	cputime_t cpu;
@@ -49,6 +50,9 @@ struct k_itimer {
 	sigval_t it_sigev_value;	/* value word of sigevent struct */
 	struct task_struct *it_process;	/* process to send signal to */
 	struct sigqueue *sigq;		/* signal queue entry. */
+#ifdef CONFIG_KEVENT_TIMER
+	struct kevent_storage st;
+#endif
 	union {
 		struct {
 			struct hrtimer timer;
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index e5ebcc1..74270f8 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -48,6 +48,8 @@
 #include <linux/wait.h>
 #include <linux/workqueue.h>
 #include <linux/module.h>
+#include <linux/kevent.h>
+#include <linux/file.h>
 
 /*
  * Management arrays for POSIX timers.	 Timers are kept in slab memory
@@ -224,6 +226,100 @@ static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp)
 	return 0;
 }
 
+#ifdef CONFIG_KEVENT_TIMER
+static int posix_kevent_enqueue(struct kevent *k)
+{
+	/*
+	 * It is not ugly - there is no pointer in the id field union, 
+	 * but its size is 64bits, which is ok for any known pointer size.
+	 */
+	struct k_itimer *tmr = (struct k_itimer *)(unsigned long)k->event.id.raw_u64;
+	return kevent_storage_enqueue(&tmr->st, k);
+}
+static int posix_kevent_dequeue(struct kevent *k)
+{
+	struct k_itimer *tmr = (struct k_itimer *)(unsigned long)k->event.id.raw_u64;
+	kevent_storage_dequeue(&tmr->st, k);
+	return 0;
+}
+static int posix_kevent_callback(struct kevent *k)
+{
+	return 1;
+}
+static int posix_kevent_init(void)
+{
+	struct kevent_callbacks tc = {
+		.callback = &posix_kevent_callback,
+		.enqueue = &posix_kevent_enqueue,
+		.dequeue = &posix_kevent_dequeue,
+		.flags = KEVENT_CALLBACKS_KERNELONLY};
+
+	return kevent_add_callbacks(&tc, KEVENT_POSIX_TIMER);
+}
+
+extern struct file_operations kevent_user_fops;
+
+static int posix_kevent_init_timer(struct k_itimer *tmr, int fd)
+{
+	struct ukevent uk;
+	struct file *file;
+	struct kevent_user *u;
+	int err;
+
+	file = fget(fd);
+	if (!file) {
+		err = -EBADF;
+		goto err_out;
+	}
+
+	if (file->f_op != &kevent_user_fops) {
+		err = -EINVAL;
+		goto err_out_fput;
+	}
+
+	u = file->private_data;
+
+	memset(&uk, 0, sizeof(struct ukevent));
+
+	uk.event = KEVENT_MASK_ALL;
+	uk.type = KEVENT_POSIX_TIMER;
+	uk.id.raw_u64 = (unsigned long)(tmr); /* Just cast to something unique */
+	uk.req_flags = KEVENT_REQ_ONESHOT | KEVENT_REQ_ALWAYS_QUEUE;
+	uk.ptr = tmr->it_sigev_value.sival_ptr;
+
+	err = kevent_user_add_ukevent(&uk, u);
+	if (err)
+		goto err_out_fput;
+
+	fput(file);
+
+	return 0;
+
+err_out_fput:
+	fput(file);
+err_out:
+	return err;
+}
+
+static void posix_kevent_fini_timer(struct k_itimer *tmr)
+{
+	kevent_storage_fini(&tmr->st);
+}
+#else
+static int posix_kevent_init_timer(struct k_itimer *tmr, int fd)
+{
+	return -ENOSYS;
+}
+static int posix_kevent_init(void)
+{
+	return 0;
+}
+static void posix_kevent_fini_timer(struct k_itimer *tmr)
+{
+}
+#endif
+
+
 /*
  * Initialize everything, well, just everything in Posix clocks/timers ;)
  */
@@ -241,6 +337,11 @@ static __init int init_posix_timers(void)
 	register_posix_clock(CLOCK_REALTIME, &clock_realtime);
 	register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic);
 
+	if (posix_kevent_init()) {
+		printk(KERN_ERR "Failed to initialize kevent posix timers.\n");
+		BUG();
+	}
+
 	posix_timers_cache = kmem_cache_create("posix_timers_cache",
 					sizeof (struct k_itimer), 0, 0, NULL, NULL);
 	idr_init(&posix_timers_id);
@@ -343,23 +444,29 @@ static int posix_timer_fn(struct hrtimer *timer)
 
 	timr = container_of(timer, struct k_itimer, it.real.timer);
 	spin_lock_irqsave(&timr->it_lock, flags);
+	
+	if (timr->it_sigev_notify == SIGEV_KEVENT) {
+#ifdef CONFIG_KEVENT_TIMER
+		kevent_storage_ready(&timr->st, NULL, KEVENT_MASK_ALL);
+#endif
+	} else {
+		if (timr->it.real.interval.tv64 != 0)
+			si_private = ++timr->it_requeue_pending;
 
-	if (timr->it.real.interval.tv64 != 0)
-		si_private = ++timr->it_requeue_pending;
-
-	if (posix_timer_event(timr, si_private)) {
-		/*
-		 * signal was not sent because of sig_ignor
-		 * we will not get a call back to restart it AND
-		 * it should be restarted.
-		 */
-		if (timr->it.real.interval.tv64 != 0) {
-			timr->it_overrun +=
-				hrtimer_forward(timer,
-						timer->base->softirq_time,
-						timr->it.real.interval);
-			ret = HRTIMER_RESTART;
-			++timr->it_requeue_pending;
+		if (posix_timer_event(timr, si_private)) {
+			/*
+			 * signal was not sent because of sig_ignor
+			 * we will not get a call back to restart it AND
+			 * it should be restarted.
+			 */
+			if (timr->it.real.interval.tv64 != 0) {
+				timr->it_overrun +=
+					hrtimer_forward(timer,
+							timer->base->softirq_time,
+							timr->it.real.interval);
+				ret = HRTIMER_RESTART;
+				++timr->it_requeue_pending;
+			}
 		}
 	}
 
@@ -407,6 +514,9 @@ static struct k_itimer * alloc_posix_timer(void)
 		kmem_cache_free(posix_timers_cache, tmr);
 		tmr = NULL;
 	}
+#ifdef CONFIG_KEVENT_TIMER
+	kevent_storage_init(tmr, &tmr->st);
+#endif
 	return tmr;
 }
 
@@ -424,6 +534,7 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
 	if (unlikely(tmr->it_process) &&
 	    tmr->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
 		put_task_struct(tmr->it_process);
+	posix_kevent_fini_timer(tmr);
 	kmem_cache_free(posix_timers_cache, tmr);
 }
 
@@ -496,40 +607,52 @@ sys_timer_create(const clockid_t which_clock,
 		new_timer->it_sigev_signo = event.sigev_signo;
 		new_timer->it_sigev_value = event.sigev_value;
 
-		read_lock(&tasklist_lock);
-		if ((process = good_sigevent(&event))) {
-			/*
-			 * We may be setting up this process for another
-			 * thread.  It may be exiting.  To catch this
-			 * case the we check the PF_EXITING flag.  If
-			 * the flag is not set, the siglock will catch
-			 * him before it is too late (in exit_itimers).
-			 *
-			 * The exec case is a bit more invloved but easy
-			 * to code.  If the process is in our thread
-			 * group (and it must be or we would not allow
-			 * it here) and is doing an exec, it will cause
-			 * us to be killed.  In this case it will wait
-			 * for us to die which means we can finish this
-			 * linkage with our last gasp. I.e. no code :)
-			 */
+		if (event.sigev_notify == SIGEV_KEVENT) {
+			error = posix_kevent_init_timer(new_timer, event._sigev_un.kevent_fd);
+			if (error)
+				goto out;
+
+			process = current->group_leader;
 			spin_lock_irqsave(&process->sighand->siglock, flags);
-			if (!(process->flags & PF_EXITING)) {
-				new_timer->it_process = process;
-				list_add(&new_timer->list,
-					 &process->signal->posix_timers);
-				spin_unlock_irqrestore(&process->sighand->siglock, flags);
-				if (new_timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
-					get_task_struct(process);
-			} else {
-				spin_unlock_irqrestore(&process->sighand->siglock, flags);
-				process = NULL;
+			new_timer->it_process = process;
+			list_add(&new_timer->list, &process->signal->posix_timers);
+			spin_unlock_irqrestore(&process->sighand->siglock, flags);
+		} else {
+			read_lock(&tasklist_lock);
+			if ((process = good_sigevent(&event))) {
+				/*
+				 * We may be setting up this process for another
+				 * thread.  It may be exiting.  To catch this
+				 * case the we check the PF_EXITING flag.  If
+				 * the flag is not set, the siglock will catch
+				 * him before it is too late (in exit_itimers).
+				 *
+				 * The exec case is a bit more invloved but easy
+				 * to code.  If the process is in our thread
+				 * group (and it must be or we would not allow
+				 * it here) and is doing an exec, it will cause
+				 * us to be killed.  In this case it will wait
+				 * for us to die which means we can finish this
+				 * linkage with our last gasp. I.e. no code :)
+				 */
+				spin_lock_irqsave(&process->sighand->siglock, flags);
+				if (!(process->flags & PF_EXITING)) {
+					new_timer->it_process = process;
+					list_add(&new_timer->list,
+						 &process->signal->posix_timers);
+					spin_unlock_irqrestore(&process->sighand->siglock, flags);
+					if (new_timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
+						get_task_struct(process);
+				} else {
+					spin_unlock_irqrestore(&process->sighand->siglock, flags);
+					process = NULL;
+				}
+			}
+			read_unlock(&tasklist_lock);
+			if (!process) {
+				error = -EINVAL;
+				goto out;
 			}
-		}
-		read_unlock(&tasklist_lock);
-		if (!process) {
-			error = -EINVAL;
-			goto out;
 		}
 	} else {
 		new_timer->it_sigev_notify = SIGEV_SIGNAL;


^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [take28-resend_2->0 0/8] kevent: Generic event handling mechanism.
  2006-12-17 13:53 ` [take28-resend_2->0 0/8] kevent: Generic event handling mechanism Evgeniy Polyakov
  2006-12-17 13:53   ` [take28-resend_2->0 1/8] kevent: Description Evgeniy Polyakov
@ 2006-12-19  3:47   ` Ulrich Drepper
  2006-12-19  4:01     ` Randy Dunlap
  2006-12-19  4:51     ` Evgeniy Polyakov
  1 sibling, 2 replies; 76+ messages in thread
From: Ulrich Drepper @ 2006-12-19  3:47 UTC (permalink / raw)
  To: Evgeniy Polyakov; +Cc: linux-kernel

It would help if one could actually get hold of the changes.

Neither at home nor on my gmail account did I get them all.  The gmane 
also only has 5 of the 9 mails or so.  Your archive only has sources 
from a couple of versions back.

-- 
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [take28-resend_2->0 0/8] kevent: Generic event handling mechanism.
  2006-12-19  3:47   ` [take28-resend_2->0 0/8] kevent: Generic event handling mechanism Ulrich Drepper
@ 2006-12-19  4:01     ` Randy Dunlap
  2006-12-19  4:51     ` Evgeniy Polyakov
  1 sibling, 0 replies; 76+ messages in thread
From: Randy Dunlap @ 2006-12-19  4:01 UTC (permalink / raw)
  To: Ulrich Drepper; +Cc: Evgeniy Polyakov, linux-kernel

On Mon, 18 Dec 2006 19:47:21 -0800 Ulrich Drepper wrote:

> It would help if one could actually get hold of the changes.
> 
> Neither at home nor on my gmail account did I get them all.  The gmane 
> also only has 5 of the 9 mails or so.  Your archive only has sources 
> from a couple of versions back.

It looks like they are all archived at marc.theaimsgroup.com.
Try this:
http://marc.theaimsgroup.com/?l=linux-netdev&w=2&r=1&s=take28-resend_2&q=t

But it would be Good for them to have a home.

---
~Randy

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [take28-resend_2->0 0/8] kevent: Generic event handling mechanism.
  2006-12-19  3:47   ` [take28-resend_2->0 0/8] kevent: Generic event handling mechanism Ulrich Drepper
  2006-12-19  4:01     ` Randy Dunlap
@ 2006-12-19  4:51     ` Evgeniy Polyakov
  2006-12-19  6:21       ` Ulrich Drepper
  1 sibling, 1 reply; 76+ messages in thread
From: Evgeniy Polyakov @ 2006-12-19  4:51 UTC (permalink / raw)
  To: Ulrich Drepper; +Cc: linux-kernel

On Mon, Dec 18, 2006 at 07:47:21PM -0800, Ulrich Drepper (drepper@redhat.com) wrote:
> It would help if one could actually get hold of the changes.
> 
> Neither at home nor on my gmail account did I get them all.  The gmane 
> also only has 5 of the 9 mails or so.  Your archive only has sources 
> from a couple of versions back.

Kernel archive contains all changes as far as I can see at
marc.theaimsgroup.com and lkml.org

I've uploaded the latest changes to the homepage.

> -- 
> ➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, 
> CA ❖

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [take28-resend_2->0 0/8] kevent: Generic event handling mechanism.
  2006-12-19  4:51     ` Evgeniy Polyakov
@ 2006-12-19  6:21       ` Ulrich Drepper
  2006-12-19  6:38         ` Evgeniy Polyakov
  0 siblings, 1 reply; 76+ messages in thread
From: Ulrich Drepper @ 2006-12-19  6:21 UTC (permalink / raw)
  To: Evgeniy Polyakov; +Cc: linux-kernel

Evgeniy Polyakov wrote:
> I've uploaded the latest changes to the homepage.

Thanks.  But could you now update the patch so that it can be compiled 
with the current upstream kernel?  At least <linux/kevent.h> has 
problems because of file->st accesses.

-- 
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [take28-resend_2->0 0/8] kevent: Generic event handling mechanism.
  2006-12-19  6:21       ` Ulrich Drepper
@ 2006-12-19  6:38         ` Evgeniy Polyakov
  2006-12-19  8:01           ` Ulrich Drepper
  0 siblings, 1 reply; 76+ messages in thread
From: Evgeniy Polyakov @ 2006-12-19  6:38 UTC (permalink / raw)
  To: Ulrich Drepper; +Cc: linux-kernel

On Mon, Dec 18, 2006 at 10:21:34PM -0800, Ulrich Drepper (drepper@redhat.com) wrote:
> Evgeniy Polyakov wrote:
> >I've uploaded the latest changes to the homepage.
> 
> Thanks.  But could you now update the patch so that it can be compiled 
> with the current upstream kernel?  At least <linux/kevent.h> has 
> problems because of file->st accesses.

file->st is only defined for poll/select events, if it is not specified
at compile time, functions in linux/kevent.h becomes void:

#ifdef CONFIG_KEVENT_POLL
static inline void kevent_init_file(struct file *file)
{
	kevent_storage_init(file, &file->st);
}

static inline void kevent_cleanup_file(struct file *file)
{
	kevent_storage_fini(&file->st);
}
#else
static inline void kevent_init_file(struct file *file) {}
static inline void kevent_cleanup_file(struct file *file) {}
#endif

What error messages do you see and what are kevent related config
changes?

> -- 
> ➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, 
> CA ❖

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [take28-resend_2->0 0/8] kevent: Generic event handling mechanism.
  2006-12-19  6:38         ` Evgeniy Polyakov
@ 2006-12-19  8:01           ` Ulrich Drepper
  2006-12-19  8:56             ` Evgeniy Polyakov
  0 siblings, 1 reply; 76+ messages in thread
From: Ulrich Drepper @ 2006-12-19  8:01 UTC (permalink / raw)
  To: Evgeniy Polyakov; +Cc: linux-kernel

Evgeniy Polyakov wrote:
> What error messages do you see and what are kevent related config
> changes?

ARCH=um

#define CONFIG_KEVENT_USER_STAT 1
#define CONFIG_KEVENT_PIPE 1
#define CONFIG_KEVENT_POLL 1
#define CONFIG_KEVENT_TIMER 1
#define CONFIG_KEVENT 1
#define CONFIG_KEVENT_SIGNAL 1
#define CONFIG_KEVENT_SOCKET 1



In file included from kernel/kevent/kevent.c:28:
include/linux/kevent.h: In function ‘kevent_init_file’:
include/linux/kevent.h:220: error: ‘struct file’ has no member named ‘st’
include/linux/kevent.h: In function ‘kevent_cleanup_file’:
include/linux/kevent.h:225: error: ‘struct file’ has no member named ‘st’

-- 
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [take28-resend_2->0 0/8] kevent: Generic event handling mechanism.
  2006-12-19  8:01           ` Ulrich Drepper
@ 2006-12-19  8:56             ` Evgeniy Polyakov
  0 siblings, 0 replies; 76+ messages in thread
From: Evgeniy Polyakov @ 2006-12-19  8:56 UTC (permalink / raw)
  To: Ulrich Drepper; +Cc: linux-kernel

On Tue, Dec 19, 2006 at 12:01:35AM -0800, Ulrich Drepper (drepper@redhat.com) wrote:
> Evgeniy Polyakov wrote:
> >What error messages do you see and what are kevent related config
> >changes?
> 
> ARCH=um
> 
> #define CONFIG_KEVENT_USER_STAT 1
> #define CONFIG_KEVENT_PIPE 1
> #define CONFIG_KEVENT_POLL 1
> #define CONFIG_KEVENT_TIMER 1
> #define CONFIG_KEVENT 1
> #define CONFIG_KEVENT_SIGNAL 1
> #define CONFIG_KEVENT_SOCKET 1
> 
> 
> 
> In file included from kernel/kevent/kevent.c:28:
> include/linux/kevent.h: In function ‘kevent_init_file’:
> include/linux/kevent.h:220: error: ‘struct file’ has no member named 
> ‘st’
> include/linux/kevent.h: In function ‘kevent_cleanup_file’:
> include/linux/kevent.h:225: error: ‘struct file’ has no member named 
> ‘st’

Can you send me your linux/fs.h file to chek if it is correct.
Also does 'make prepare' fix the buid?

> -- 
> ➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, 
> CA ❖

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 76+ messages in thread

* [take28-resend_1->0 4/8] kevent: Socket notifications.
  2006-12-21  9:14       ` [take28-resend_1->0 3/8] kevent: poll/select() notifications Evgeniy Polyakov
@ 2006-12-21  9:14         ` Evgeniy Polyakov
  2006-12-21  9:14           ` [take28-resend_1->0 5/8] kevent: Timer notifications Evgeniy Polyakov
  0 siblings, 1 reply; 76+ messages in thread
From: Evgeniy Polyakov @ 2006-12-21  9:14 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: David Miller, Ulrich Drepper, Andrew Morton, Evgeniy Polyakov,
	netdev, Zach Brown, Christoph Hellwig, Chase Venters,
	Johann Borck, linux-kernel, Jeff Garzik


Socket notifications.

This patch includes socket send/recv/accept notifications.
Using trivial web server based on kevent and this features
instead of epoll it's performance increased more than noticebly.
More details about various benchmarks and server itself 
(evserver_kevent.c) can be found on project's homepage.

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mitp.ru>

diff --git a/fs/inode.c b/fs/inode.c
index ada7643..2740617 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -21,6 +21,7 @@
 #include <linux/cdev.h>
 #include <linux/bootmem.h>
 #include <linux/inotify.h>
+#include <linux/kevent.h>
 #include <linux/mount.h>
 
 /*
@@ -164,12 +165,18 @@ static struct inode *alloc_inode(struct super_block *sb)
 		}
 		inode->i_private = 0;
 		inode->i_mapping = mapping;
+#if defined CONFIG_KEVENT_SOCKET || defined CONFIG_KEVENT_PIPE
+		kevent_storage_init(inode, &inode->st);
+#endif
 	}
 	return inode;
 }
 
 void destroy_inode(struct inode *inode) 
 {
+#if defined CONFIG_KEVENT_SOCKET || defined CONFIG_KEVENT_PIPE
+	kevent_storage_fini(&inode->st);
+#endif
 	BUG_ON(inode_has_buffers(inode));
 	security_inode_free(inode);
 	if (inode->i_sb->s_op->destroy_inode)
diff --git a/include/net/sock.h b/include/net/sock.h
index edd4d73..d48ded8 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -48,6 +48,7 @@
 #include <linux/netdevice.h>
 #include <linux/skbuff.h>	/* struct sk_buff */
 #include <linux/security.h>
+#include <linux/kevent.h>
 
 #include <linux/filter.h>
 
@@ -450,6 +451,21 @@ static inline int sk_stream_memory_free(struct sock *sk)
 
 extern void sk_stream_rfree(struct sk_buff *skb);
 
+struct socket_alloc {
+	struct socket socket;
+	struct inode vfs_inode;
+};
+
+static inline struct socket *SOCKET_I(struct inode *inode)
+{
+	return &container_of(inode, struct socket_alloc, vfs_inode)->socket;
+}
+
+static inline struct inode *SOCK_INODE(struct socket *socket)
+{
+	return &container_of(socket, struct socket_alloc, socket)->vfs_inode;
+}
+
 static inline void sk_stream_set_owner_r(struct sk_buff *skb, struct sock *sk)
 {
 	skb->sk = sk;
@@ -477,6 +493,7 @@ static inline void sk_add_backlog(struct sock *sk, struct sk_buff *skb)
 		sk->sk_backlog.tail = skb;
 	}
 	skb->next = NULL;
+	kevent_socket_notify(sk, KEVENT_SOCKET_RECV);
 }
 
 #define sk_wait_event(__sk, __timeo, __condition)		\
@@ -679,21 +696,6 @@ static inline struct kiocb *siocb_to_kiocb(struct sock_iocb *si)
 	return si->kiocb;
 }
 
-struct socket_alloc {
-	struct socket socket;
-	struct inode vfs_inode;
-};
-
-static inline struct socket *SOCKET_I(struct inode *inode)
-{
-	return &container_of(inode, struct socket_alloc, vfs_inode)->socket;
-}
-
-static inline struct inode *SOCK_INODE(struct socket *socket)
-{
-	return &container_of(socket, struct socket_alloc, socket)->vfs_inode;
-}
-
 extern void __sk_stream_mem_reclaim(struct sock *sk);
 extern int sk_stream_mem_schedule(struct sock *sk, int size, int kind);
 
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 7a093d0..69f4ad2 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -857,6 +857,7 @@ static inline int tcp_prequeue(struct sock *sk, struct sk_buff *skb)
 			tp->ucopy.memory = 0;
 		} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
 			wake_up_interruptible(sk->sk_sleep);
+			kevent_socket_notify(sk, KEVENT_SOCKET_RECV|KEVENT_SOCKET_SEND);
 			if (!inet_csk_ack_scheduled(sk))
 				inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
 						          (3 * TCP_RTO_MIN) / 4,
diff --git a/kernel/kevent/kevent_socket.c b/kernel/kevent/kevent_socket.c
new file mode 100644
index 0000000..1798092
--- /dev/null
+++ b/kernel/kevent/kevent_socket.c
@@ -0,0 +1,144 @@
+/*
+ * 	kevent_socket.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/timer.h>
+#include <linux/file.h>
+#include <linux/tcp.h>
+#include <linux/kevent.h>
+
+#include <net/sock.h>
+#include <net/request_sock.h>
+#include <net/inet_connection_sock.h>
+
+static int kevent_socket_callback(struct kevent *k)
+{
+	struct inode *inode = k->st->origin;
+	unsigned int events = SOCKET_I(inode)->ops->poll(SOCKET_I(inode)->file, SOCKET_I(inode), NULL);
+
+	if ((events & (POLLIN | POLLRDNORM)) && (k->event.event & (KEVENT_SOCKET_RECV | KEVENT_SOCKET_ACCEPT)))
+		return 1;
+	if ((events & (POLLOUT | POLLWRNORM)) && (k->event.event & KEVENT_SOCKET_SEND))
+		return 1;
+	if (events & (POLLERR | POLLHUP))
+		return -1;
+	return 0;
+}
+
+int kevent_socket_enqueue(struct kevent *k)
+{
+	struct inode *inode;
+	struct socket *sock;
+	int err = -EBADF;
+
+	sock = sockfd_lookup(k->event.id.raw[0], &err);
+	if (!sock)
+		goto err_out_exit;
+
+	inode = igrab(SOCK_INODE(sock));
+	if (!inode)
+		goto err_out_fput;
+
+	err = kevent_storage_enqueue(&inode->st, k);
+	if (err)
+		goto err_out_iput;
+
+	if (k->event.req_flags & KEVENT_REQ_ALWAYS_QUEUE) {
+		kevent_requeue(k);
+		err = 0;
+	} else {
+		err = k->callbacks.callback(k);
+		if (err)
+			goto err_out_dequeue;
+	}
+
+	return err;
+
+err_out_dequeue:
+	kevent_storage_dequeue(k->st, k);
+err_out_iput:
+	iput(inode);
+err_out_fput:
+	sockfd_put(sock);
+err_out_exit:
+	return err;
+}
+
+int kevent_socket_dequeue(struct kevent *k)
+{
+	struct inode *inode = k->st->origin;
+	struct socket *sock;
+
+	kevent_storage_dequeue(k->st, k);
+
+	sock = SOCKET_I(inode);
+	iput(inode);
+	sockfd_put(sock);
+
+	return 0;
+}
+
+void kevent_socket_notify(struct sock *sk, u32 event)
+{
+	if (sk->sk_socket)
+		kevent_storage_ready(&SOCK_INODE(sk->sk_socket)->st, NULL, event);
+}
+
+/*
+ * It is required for network protocols compiled as modules, like IPv6.
+ */
+EXPORT_SYMBOL_GPL(kevent_socket_notify);
+
+#ifdef CONFIG_LOCKDEP
+static struct lock_class_key kevent_sock_key;
+
+void kevent_socket_reinit(struct socket *sock)
+{
+	struct inode *inode = SOCK_INODE(sock);
+
+	lockdep_set_class(&inode->st.lock, &kevent_sock_key);
+}
+
+void kevent_sk_reinit(struct sock *sk)
+{
+	if (sk->sk_socket) {
+		struct inode *inode = SOCK_INODE(sk->sk_socket);
+
+		lockdep_set_class(&inode->st.lock, &kevent_sock_key);
+	}
+}
+#endif
+static int __init kevent_init_socket(void)
+{
+	struct kevent_callbacks sc = {
+		.callback = &kevent_socket_callback,
+		.enqueue = &kevent_socket_enqueue,
+		.dequeue = &kevent_socket_dequeue,
+		.flags = 0,
+	};
+
+	return kevent_add_callbacks(&sc, KEVENT_SOCKET);
+}
+module_init(kevent_init_socket);
diff --git a/net/core/sock.c b/net/core/sock.c
index b77e155..7d5fa3e 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1402,6 +1402,7 @@ static void sock_def_wakeup(struct sock *sk)
 	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
 		wake_up_interruptible_all(sk->sk_sleep);
 	read_unlock(&sk->sk_callback_lock);
+	kevent_socket_notify(sk, KEVENT_SOCKET_RECV|KEVENT_SOCKET_SEND);
 }
 
 static void sock_def_error_report(struct sock *sk)
@@ -1411,6 +1412,7 @@ static void sock_def_error_report(struct sock *sk)
 		wake_up_interruptible(sk->sk_sleep);
 	sk_wake_async(sk,0,POLL_ERR); 
 	read_unlock(&sk->sk_callback_lock);
+	kevent_socket_notify(sk, KEVENT_SOCKET_RECV|KEVENT_SOCKET_SEND);
 }
 
 static void sock_def_readable(struct sock *sk, int len)
@@ -1420,6 +1422,7 @@ static void sock_def_readable(struct sock *sk, int len)
 		wake_up_interruptible(sk->sk_sleep);
 	sk_wake_async(sk,1,POLL_IN);
 	read_unlock(&sk->sk_callback_lock);
+	kevent_socket_notify(sk, KEVENT_SOCKET_RECV|KEVENT_SOCKET_SEND);
 }
 
 static void sock_def_write_space(struct sock *sk)
@@ -1439,6 +1442,7 @@ static void sock_def_write_space(struct sock *sk)
 	}
 
 	read_unlock(&sk->sk_callback_lock);
+	kevent_socket_notify(sk, KEVENT_SOCKET_SEND|KEVENT_SOCKET_RECV);
 }
 
 static void sock_def_destruct(struct sock *sk)
@@ -1489,6 +1493,8 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 	sk->sk_state		=	TCP_CLOSE;
 	sk->sk_socket		=	sock;
 
+	kevent_sk_reinit(sk);
+
 	sock_set_flag(sk, SOCK_ZAPPED);
 
 	if(sock)
@@ -1555,8 +1561,10 @@ void fastcall release_sock(struct sock *sk)
 	if (sk->sk_backlog.tail)
 		__release_sock(sk);
 	sk->sk_lock.owner = NULL;
-	if (waitqueue_active(&sk->sk_lock.wq))
+	if (waitqueue_active(&sk->sk_lock.wq)) {
 		wake_up(&sk->sk_lock.wq);
+		kevent_socket_notify(sk, KEVENT_SOCKET_RECV|KEVENT_SOCKET_SEND);
+	}
 	spin_unlock_bh(&sk->sk_lock.slock);
 }
 EXPORT_SYMBOL(release_sock);
diff --git a/net/core/stream.c b/net/core/stream.c
index d1d7dec..2878c2a 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -36,6 +36,7 @@ void sk_stream_write_space(struct sock *sk)
 			wake_up_interruptible(sk->sk_sleep);
 		if (sock->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN))
 			sock_wake_async(sock, 2, POLL_OUT);
+		kevent_socket_notify(sk, KEVENT_SOCKET_SEND|KEVENT_SOCKET_RECV);
 	}
 }
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 3f884ce..e7dd989 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3119,6 +3119,7 @@ static void tcp_ofo_queue(struct sock *sk)
 
 		__skb_unlink(skb, &tp->out_of_order_queue);
 		__skb_queue_tail(&sk->sk_receive_queue, skb);
+		kevent_socket_notify(sk, KEVENT_SOCKET_RECV);
 		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
 		if(skb->h.th->fin)
 			tcp_fin(skb, sk, skb->h.th);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index c83938b..b0dd70d 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -61,6 +61,7 @@
 #include <linux/jhash.h>
 #include <linux/init.h>
 #include <linux/times.h>
+#include <linux/kevent.h>
 
 #include <net/icmp.h>
 #include <net/inet_hashtables.h>
@@ -870,6 +871,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	   	reqsk_free(req);
 	} else {
 		inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
+		kevent_socket_notify(sk, KEVENT_SOCKET_ACCEPT);
 	}
 	return 0;
 
diff --git a/net/socket.c b/net/socket.c
index 1bc4167..5582b4a 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -85,6 +85,7 @@
 #include <linux/kmod.h>
 #include <linux/audit.h>
 #include <linux/wireless.h>
+#include <linux/kevent.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -490,6 +491,8 @@ static struct socket *sock_alloc(void)
 	inode->i_uid = current->fsuid;
 	inode->i_gid = current->fsgid;
 
+	kevent_socket_reinit(sock);
+
 	get_cpu_var(sockets_in_use)++;
 	put_cpu_var(sockets_in_use);
 	return sock;


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [take28-resend_1->0 6/8] kevent: Pipe notifications.
  2006-12-21  9:14           ` [take28-resend_1->0 5/8] kevent: Timer notifications Evgeniy Polyakov
@ 2006-12-21  9:14             ` Evgeniy Polyakov
  2006-12-21  9:14               ` [take28-resend_1->0 7/8] kevent: Signal notifications Evgeniy Polyakov
  0 siblings, 1 reply; 76+ messages in thread
From: Evgeniy Polyakov @ 2006-12-21  9:14 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: David Miller, Ulrich Drepper, Andrew Morton, Evgeniy Polyakov,
	netdev, Zach Brown, Christoph Hellwig, Chase Venters,
	Johann Borck, linux-kernel, Jeff Garzik


Pipe notifications.


diff --git a/fs/pipe.c b/fs/pipe.c
index f3b6f71..aeaee9c 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -16,6 +16,7 @@
 #include <linux/uio.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
+#include <linux/kevent.h>
 
 #include <asm/uaccess.h>
 #include <asm/ioctls.h>
@@ -312,6 +313,7 @@ redo:
 			break;
 		}
 		if (do_wakeup) {
+			kevent_pipe_notify(inode, KEVENT_SOCKET_SEND);
 			wake_up_interruptible_sync(&pipe->wait);
  			kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
 		}
@@ -321,6 +323,7 @@ redo:
 
 	/* Signal writers asynchronously that there is more room. */
 	if (do_wakeup) {
+		kevent_pipe_notify(inode, KEVENT_SOCKET_SEND);
 		wake_up_interruptible(&pipe->wait);
 		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
 	}
@@ -490,6 +493,7 @@ redo2:
 			break;
 		}
 		if (do_wakeup) {
+			kevent_pipe_notify(inode, KEVENT_SOCKET_RECV);
 			wake_up_interruptible_sync(&pipe->wait);
 			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 			do_wakeup = 0;
@@ -501,6 +505,7 @@ redo2:
 out:
 	mutex_unlock(&inode->i_mutex);
 	if (do_wakeup) {
+		kevent_pipe_notify(inode, KEVENT_SOCKET_RECV);
 		wake_up_interruptible(&pipe->wait);
 		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 	}
@@ -605,6 +610,7 @@ pipe_release(struct inode *inode, int decr, int decw)
 		free_pipe_info(inode);
 	} else {
 		wake_up_interruptible(&pipe->wait);
+		kevent_pipe_notify(inode, KEVENT_SOCKET_SEND|KEVENT_SOCKET_RECV);
 		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
 	}
diff --git a/kernel/kevent/kevent_pipe.c b/kernel/kevent/kevent_pipe.c
new file mode 100644
index 0000000..91dc1eb
--- /dev/null
+++ b/kernel/kevent/kevent_pipe.c
@@ -0,0 +1,123 @@
+/*
+ * 	kevent_pipe.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/kevent.h>
+#include <linux/pipe_fs_i.h>
+
+static int kevent_pipe_callback(struct kevent *k)
+{
+	struct inode *inode = k->st->origin;
+	struct pipe_inode_info *pipe = inode->i_pipe;
+	int nrbufs = pipe->nrbufs;
+
+	if (k->event.event & KEVENT_SOCKET_RECV && nrbufs > 0) {
+		if (!pipe->writers)
+			return -1;
+		return 1;
+	}
+	
+	if (k->event.event & KEVENT_SOCKET_SEND && nrbufs < PIPE_BUFFERS) {
+		if (!pipe->readers)
+			return -1;
+		return 1;
+	}
+
+	return 0;
+}
+
+int kevent_pipe_enqueue(struct kevent *k)
+{
+	struct file *pipe;
+	int err = -EBADF;
+	struct inode *inode;
+
+	pipe = fget(k->event.id.raw[0]);
+	if (!pipe)
+		goto err_out_exit;
+
+	inode = igrab(pipe->f_dentry->d_inode);
+	if (!inode)
+		goto err_out_fput;
+
+	err = -EINVAL;
+	if (!S_ISFIFO(inode->i_mode))
+		goto err_out_iput;
+
+	err = kevent_storage_enqueue(&inode->st, k);
+	if (err)
+		goto err_out_iput;
+
+	if (k->event.req_flags & KEVENT_REQ_ALWAYS_QUEUE) {
+		kevent_requeue(k);
+		err = 0;
+	} else {
+		err = k->callbacks.callback(k);
+		if (err)
+			goto err_out_dequeue;
+	}
+
+	fput(pipe);
+
+	return err;
+
+err_out_dequeue:
+	kevent_storage_dequeue(k->st, k);
+err_out_iput:
+	iput(inode);
+err_out_fput:
+	fput(pipe);
+err_out_exit:
+	return err;
+}
+
+int kevent_pipe_dequeue(struct kevent *k)
+{
+	struct inode *inode = k->st->origin;
+
+	kevent_storage_dequeue(k->st, k);
+	iput(inode);
+
+	return 0;
+}
+
+void kevent_pipe_notify(struct inode *inode, u32 event)
+{
+	kevent_storage_ready(&inode->st, NULL, event);
+}
+
+static int __init kevent_init_pipe(void)
+{
+	struct kevent_callbacks sc = {
+		.callback = &kevent_pipe_callback,
+		.enqueue = &kevent_pipe_enqueue,
+		.dequeue = &kevent_pipe_dequeue,
+		.flags = 0,
+	};
+
+	return kevent_add_callbacks(&sc, KEVENT_PIPE);
+}
+module_init(kevent_init_pipe);


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [take28-resend_1->0 3/8] kevent: poll/select() notifications.
  2006-12-21  9:14     ` [take28-resend_1->0 2/8] kevent: Core files Evgeniy Polyakov
@ 2006-12-21  9:14       ` Evgeniy Polyakov
  2006-12-21  9:14         ` [take28-resend_1->0 4/8] kevent: Socket notifications Evgeniy Polyakov
  0 siblings, 1 reply; 76+ messages in thread
From: Evgeniy Polyakov @ 2006-12-21  9:14 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: David Miller, Ulrich Drepper, Andrew Morton, Evgeniy Polyakov,
	netdev, Zach Brown, Christoph Hellwig, Chase Venters,
	Johann Borck, linux-kernel, Jeff Garzik


poll/select() notifications.

This patch includes generic poll/select notifications.
kevent_poll works simialr to epoll and has the same issues (callback
is invoked not from internal state machine of the caller, but through
process awake, a lot of allocations and so on).

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mitp.ru>

diff --git a/fs/file_table.c b/fs/file_table.c
index bc35a40..0805547 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -20,6 +20,7 @@
 #include <linux/cdev.h>
 #include <linux/fsnotify.h>
 #include <linux/sysctl.h>
+#include <linux/kevent.h>
 #include <linux/percpu_counter.h>
 
 #include <asm/atomic.h>
@@ -119,6 +120,7 @@ struct file *get_empty_filp(void)
 	f->f_uid = tsk->fsuid;
 	f->f_gid = tsk->fsgid;
 	eventpoll_init_file(f);
+	kevent_init_file(f);
 	/* f->f_version: 0 */
 	return f;
 
@@ -164,6 +166,7 @@ void fastcall __fput(struct file *file)
 	 * in the file cleanup chain.
 	 */
 	eventpoll_release(file);
+	kevent_cleanup_file(file);
 	locks_remove_flock(file);
 
 	if (file->f_op && file->f_op->release)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 5baf3a1..8bbf3a5 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -276,6 +276,7 @@ extern int dir_notify_enable;
 #include <linux/init.h>
 #include <linux/sched.h>
 #include <linux/mutex.h>
+#include <linux/kevent_storage.h>
 
 #include <asm/atomic.h>
 #include <asm/semaphore.h>
@@ -586,6 +587,10 @@ struct inode {
 	struct mutex		inotify_mutex;	/* protects the watches list */
 #endif
 
+#if defined CONFIG_KEVENT_SOCKET || defined CONFIG_KEVENT_PIPE
+	struct kevent_storage	st;
+#endif
+
 	unsigned long		i_state;
 	unsigned long		dirtied_when;	/* jiffies of first dirtying */
 
@@ -739,6 +744,9 @@ struct file {
 	struct list_head	f_ep_links;
 	spinlock_t		f_ep_lock;
 #endif /* #ifdef CONFIG_EPOLL */
+#ifdef CONFIG_KEVENT_POLL
+	struct kevent_storage	st;
+#endif
 	struct address_space	*f_mapping;
 };
 extern spinlock_t files_lock;
diff --git a/kernel/kevent/kevent_poll.c b/kernel/kevent/kevent_poll.c
new file mode 100644
index 0000000..7ccf7da
--- /dev/null
+++ b/kernel/kevent/kevent_poll.c
@@ -0,0 +1,234 @@
+/*
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/timer.h>
+#include <linux/file.h>
+#include <linux/kevent.h>
+#include <linux/poll.h>
+#include <linux/fs.h>
+
+static kmem_cache_t *kevent_poll_container_cache;
+static kmem_cache_t *kevent_poll_priv_cache;
+
+struct kevent_poll_ctl
+{
+	struct poll_table_struct 	pt;
+	struct kevent			*k;
+};
+
+struct kevent_poll_wait_container
+{
+	struct list_head		container_entry;
+	wait_queue_head_t		*whead;
+	wait_queue_t			wait;
+	struct kevent			*k;
+};
+
+struct kevent_poll_private
+{
+	struct list_head		container_list;
+	spinlock_t			container_lock;
+};
+
+static int kevent_poll_enqueue(struct kevent *k);
+static int kevent_poll_dequeue(struct kevent *k);
+static int kevent_poll_callback(struct kevent *k);
+
+static int kevent_poll_wait_callback(wait_queue_t *wait,
+		unsigned mode, int sync, void *key)
+{
+	struct kevent_poll_wait_container *cont =
+		container_of(wait, struct kevent_poll_wait_container, wait);
+	struct kevent *k = cont->k;
+
+	kevent_storage_ready(k->st, NULL, KEVENT_MASK_ALL);
+	return 0;
+}
+
+static void kevent_poll_qproc(struct file *file, wait_queue_head_t *whead,
+		struct poll_table_struct *poll_table)
+{
+	struct kevent *k =
+		container_of(poll_table, struct kevent_poll_ctl, pt)->k;
+	struct kevent_poll_private *priv = k->priv;
+	struct kevent_poll_wait_container *cont;
+	unsigned long flags;
+
+	cont = kmem_cache_alloc(kevent_poll_container_cache, GFP_KERNEL);
+	if (!cont) {
+		kevent_break(k);
+		return;
+	}
+
+	cont->k = k;
+	init_waitqueue_func_entry(&cont->wait, kevent_poll_wait_callback);
+	cont->whead = whead;
+
+	spin_lock_irqsave(&priv->container_lock, flags);
+	list_add_tail(&cont->container_entry, &priv->container_list);
+	spin_unlock_irqrestore(&priv->container_lock, flags);
+
+	add_wait_queue(whead, &cont->wait);
+}
+
+static int kevent_poll_enqueue(struct kevent *k)
+{
+	struct file *file;
+	int err;
+	unsigned int revents;
+	unsigned long flags;
+	struct kevent_poll_ctl ctl;
+	struct kevent_poll_private *priv;
+
+	file = fget(k->event.id.raw[0]);
+	if (!file)
+		return -EBADF;
+	
+	err = -EINVAL;
+	if (!file->f_op || !file->f_op->poll)
+		goto err_out_fput;
+	
+	err = -ENOMEM;
+	priv = kmem_cache_alloc(kevent_poll_priv_cache, GFP_KERNEL);
+	if (!priv)
+		goto err_out_fput;
+
+	spin_lock_init(&priv->container_lock);
+	INIT_LIST_HEAD(&priv->container_list);
+
+	k->priv = priv;
+
+	ctl.k = k;
+	init_poll_funcptr(&ctl.pt, &kevent_poll_qproc);
+	
+	err = kevent_storage_enqueue(&file->st, k);
+	if (err)
+		goto err_out_free;
+
+	if (k->event.req_flags & KEVENT_REQ_ALWAYS_QUEUE) {
+		kevent_requeue(k);
+	} else {
+		revents = file->f_op->poll(file, &ctl.pt);
+		if (revents & k->event.event) {
+			err = 1;
+			goto out_dequeue;
+		}
+	}
+
+	spin_lock_irqsave(&k->ulock, flags);
+	k->event.req_flags |= KEVENT_REQ_LAST_CHECK;
+	spin_unlock_irqrestore(&k->ulock, flags);
+
+	return 0;
+
+out_dequeue:
+	kevent_storage_dequeue(k->st, k);
+err_out_free:
+	kmem_cache_free(kevent_poll_priv_cache, priv);
+err_out_fput:
+	fput(file);
+	return err;
+}
+
+static int kevent_poll_dequeue(struct kevent *k)
+{
+	struct file *file = k->st->origin;
+	struct kevent_poll_private *priv = k->priv;
+	struct kevent_poll_wait_container *w, *n;
+	unsigned long flags;
+
+	kevent_storage_dequeue(k->st, k);
+
+	spin_lock_irqsave(&priv->container_lock, flags);
+	list_for_each_entry_safe(w, n, &priv->container_list, container_entry) {
+		list_del(&w->container_entry);
+		remove_wait_queue(w->whead, &w->wait);
+		kmem_cache_free(kevent_poll_container_cache, w);
+	}
+	spin_unlock_irqrestore(&priv->container_lock, flags);
+
+	kmem_cache_free(kevent_poll_priv_cache, priv);
+	k->priv = NULL;
+
+	fput(file);
+
+	return 0;
+}
+
+static int kevent_poll_callback(struct kevent *k)
+{
+	if (k->event.req_flags & KEVENT_REQ_LAST_CHECK) {
+		return 1;
+	} else {
+		struct file *file = k->st->origin;
+		unsigned int revents = file->f_op->poll(file, NULL);
+
+		k->event.ret_data[0] = revents & k->event.event;
+		
+		return (revents & k->event.event);
+	}
+}
+
+static int __init kevent_poll_sys_init(void)
+{
+	struct kevent_callbacks pc = {
+		.callback = &kevent_poll_callback,
+		.enqueue = &kevent_poll_enqueue,
+		.dequeue = &kevent_poll_dequeue,
+		.flags = 0,
+	};
+
+	kevent_poll_container_cache = kmem_cache_create("kevent_poll_container_cache",
+			sizeof(struct kevent_poll_wait_container), 0, 0, NULL, NULL);
+	if (!kevent_poll_container_cache) {
+		printk(KERN_ERR "Failed to create kevent poll container cache.\n");
+		return -ENOMEM;
+	}
+
+	kevent_poll_priv_cache = kmem_cache_create("kevent_poll_priv_cache",
+			sizeof(struct kevent_poll_private), 0, 0, NULL, NULL);
+	if (!kevent_poll_priv_cache) {
+		printk(KERN_ERR "Failed to create kevent poll private data cache.\n");
+		kmem_cache_destroy(kevent_poll_container_cache);
+		kevent_poll_container_cache = NULL;
+		return -ENOMEM;
+	}
+
+	kevent_add_callbacks(&pc, KEVENT_POLL);
+
+	printk(KERN_INFO "Kevent poll()/select() subsystem has been initialized.\n");
+	return 0;
+}
+
+static struct lock_class_key kevent_poll_key;
+
+void kevent_poll_reinit(struct file *file)
+{
+	lockdep_set_class(&file->st.lock, &kevent_poll_key);
+}
+
+static void __exit kevent_poll_sys_fini(void)
+{
+	kmem_cache_destroy(kevent_poll_priv_cache);
+	kmem_cache_destroy(kevent_poll_container_cache);
+}
+
+module_init(kevent_poll_sys_init);
+module_exit(kevent_poll_sys_fini);


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [take28-resend_1->0 2/8] kevent: Core files.
  2006-12-21  9:14   ` [take28-resend_1->0 1/8] kevent: Description Evgeniy Polyakov
@ 2006-12-21  9:14     ` Evgeniy Polyakov
  2006-12-21  9:14       ` [take28-resend_1->0 3/8] kevent: poll/select() notifications Evgeniy Polyakov
  0 siblings, 1 reply; 76+ messages in thread
From: Evgeniy Polyakov @ 2006-12-21  9:14 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: David Miller, Ulrich Drepper, Andrew Morton, Evgeniy Polyakov,
	netdev, Zach Brown, Christoph Hellwig, Chase Venters,
	Johann Borck, linux-kernel, Jeff Garzik


Core files.

This patch includes core kevent files:
 * userspace controlling
 * kernelspace interfaces
 * initialization
 * notification state machines

Some bits of documentation can be found on project's homepage (and links from there):
http://tservice.net.ru/~s0mbre/old/?section=projects&item=kevent

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>

diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S
index 7e639f7..a6221c2 100644
--- a/arch/i386/kernel/syscall_table.S
+++ b/arch/i386/kernel/syscall_table.S
@@ -318,3 +318,8 @@ ENTRY(sys_call_table)
 	.long sys_vmsplice
 	.long sys_move_pages
 	.long sys_getcpu
+	.long sys_kevent_get_events
+	.long sys_kevent_ctl		/* 320 */
+	.long sys_kevent_wait
+	.long sys_kevent_commit
+	.long sys_kevent_init
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index b4aa875..dda2168 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -714,8 +714,13 @@ ia32_sys_call_table:
 	.quad compat_sys_get_robust_list
 	.quad sys_splice
 	.quad sys_sync_file_range
-	.quad sys_tee
+	.quad sys_tee			/* 315 */
 	.quad compat_sys_vmsplice
 	.quad compat_sys_move_pages
 	.quad sys_getcpu
+	.quad sys_kevent_get_events
+	.quad sys_kevent_ctl		/* 320 */
+	.quad sys_kevent_wait
+	.quad sys_kevent_commit
+	.quad sys_kevent_init
 ia32_syscall_end:		
diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
index bd99870..57a6b8c 100644
--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h
@@ -324,10 +324,15 @@
 #define __NR_vmsplice		316
 #define __NR_move_pages		317
 #define __NR_getcpu		318
+#define __NR_kevent_get_events	319
+#define __NR_kevent_ctl		320
+#define __NR_kevent_wait	321
+#define __NR_kevent_commit	322
+#define __NR_kevent_init	323
 
 #ifdef __KERNEL__
 
-#define NR_syscalls 319
+#define NR_syscalls 324
 #include <linux/err.h>
 
 /*
diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h
index 6137146..17d750d 100644
--- a/include/asm-x86_64/unistd.h
+++ b/include/asm-x86_64/unistd.h
@@ -619,10 +619,20 @@ __SYSCALL(__NR_sync_file_range, sys_sync_file_range)
 __SYSCALL(__NR_vmsplice, sys_vmsplice)
 #define __NR_move_pages		279
 __SYSCALL(__NR_move_pages, sys_move_pages)
+#define __NR_kevent_get_events	280
+__SYSCALL(__NR_kevent_get_events, sys_kevent_get_events)
+#define __NR_kevent_ctl		281
+__SYSCALL(__NR_kevent_ctl, sys_kevent_ctl)
+#define __NR_kevent_wait	282
+__SYSCALL(__NR_kevent_wait, sys_kevent_wait)
+#define __NR_kevent_commit	283
+__SYSCALL(__NR_kevent_commit, sys_kevent_commit)
+#define __NR_kevent_init	284
+__SYSCALL(__NR_kevent_init, sys_kevent_init)
 
 #ifdef __KERNEL__
 
-#define __NR_syscall_max __NR_move_pages
+#define __NR_syscall_max __NR_kevent_init
 #include <linux/err.h>
 
 #ifndef __NO_STUBS
diff --git a/include/linux/kevent.h b/include/linux/kevent.h
new file mode 100644
index 0000000..2c31df3
--- /dev/null
+++ b/include/linux/kevent.h
@@ -0,0 +1,244 @@
+/*
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef __KEVENT_H
+#define __KEVENT_H
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/rbtree.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/wait.h>
+#include <linux/net.h>
+#include <linux/rcupdate.h>
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/hrtimer.h>
+#include <linux/kevent_storage.h>
+#include <linux/ukevent.h>
+
+#define KEVENT_MIN_BUFFS_ALLOC	3
+
+struct kevent;
+struct kevent_storage;
+typedef int (* kevent_callback_t)(struct kevent *);
+
+/* @callback is called each time new event has been caught. */
+/* @enqueue is called each time new event is queued. */
+/* @dequeue is called each time event is dequeued. */
+/* @flags - flags for given set of callbacks. */
+
+struct kevent_callbacks {
+	kevent_callback_t	callback, enqueue, dequeue;
+	unsigned int		flags;
+};
+
+#define KEVENT_CALLBACKS_KERNELONLY	0x1
+
+int kevent_event_is_allowed(struct ukevent *e);
+
+#define KEVENT_READY		0x1
+#define KEVENT_STORAGE		0x2
+#define KEVENT_USER		0x4
+
+struct kevent
+{
+	/* Used for kevent freeing.*/
+	struct rcu_head		rcu_head;
+	struct ukevent		event;
+	/* This lock protects ukevent manipulations, e.g. ret_flags changes. */
+	spinlock_t		ulock;
+
+	/* Entry of user's tree. */
+	struct rb_node		kevent_node;
+	/* Entry of origin's queue. */
+	struct list_head	storage_entry;
+	/* Entry of user's ready. */
+	struct list_head	ready_entry;
+
+	u32			flags;
+
+	/* User who requested this kevent. */
+	struct kevent_user	*user;
+	/* Kevent container. */
+	struct kevent_storage	*st;
+
+	struct kevent_callbacks	callbacks;
+
+	/* Private data for different storages.
+	 * poll()/select storage has a list of wait_queue_t containers
+	 * for each ->poll() { poll_wait()' } here.
+	 */
+	void			*priv;
+};
+
+struct kevent_user
+{
+	struct rb_root		kevent_root;
+	spinlock_t		kevent_lock;
+	/* Number of queued kevents. */
+	unsigned int		kevent_num;
+
+	/* List of ready kevents. */
+	struct list_head	ready_list;
+	/* Number of ready kevents. */
+	unsigned int		ready_num;
+	/* Protects all manipulations with ready queue. */
+	spinlock_t 		ready_lock;
+
+	/* Protects against simultaneous kevent_user control manipulations. */
+	struct mutex		ctl_mutex;
+	/* Wait until some events are ready. */
+	wait_queue_head_t	wait;
+	/* Exit from syscall if someone wants us to do it */
+	int			need_exit;
+
+	/* Reference counter, increased for each new kevent. */
+	atomic_t		refcnt;
+
+	/* Mutex protecting userspace ring buffer. */
+	struct mutex		ring_lock;
+	/* Kernel index and size of the userspace ring buffer. */
+	unsigned int		kidx, uidx, ring_size, ring_over, full;
+	/* Pointer to userspace ring buffer. */
+	struct kevent_ring __user *pring;
+
+	/* Is used for absolute waiting times. */
+	struct hrtimer		timer;
+
+#ifdef CONFIG_KEVENT_USER_STAT
+	unsigned long		im_num;
+	unsigned long		wait_num, ring_num;
+	unsigned long		total;
+#endif
+};
+
+int kevent_enqueue(struct kevent *k);
+int kevent_dequeue(struct kevent *k);
+int kevent_init(struct kevent *k);
+void kevent_requeue(struct kevent *k);
+int kevent_break(struct kevent *k);
+
+int kevent_add_callbacks(const struct kevent_callbacks *cb, int pos);
+
+void kevent_storage_ready(struct kevent_storage *st,
+		kevent_callback_t ready_callback, u32 event);
+int kevent_storage_init(void *origin, struct kevent_storage *st);
+void kevent_storage_fini(struct kevent_storage *st);
+int kevent_storage_enqueue(struct kevent_storage *st, struct kevent *k);
+void kevent_storage_dequeue(struct kevent_storage *st, struct kevent *k);
+
+void kevent_ready(struct kevent *k, int ret);
+
+int kevent_user_add_ukevent(struct ukevent *uk, struct kevent_user *u);
+
+#ifdef CONFIG_KEVENT_POLL
+void kevent_poll_reinit(struct file *file);
+#else
+static inline void kevent_poll_reinit(struct file *file)
+{
+}
+#endif
+
+#ifdef CONFIG_KEVENT_USER_STAT
+static inline void kevent_stat_init(struct kevent_user *u)
+{
+	u->wait_num = u->im_num = u->total = u->ring_num = 0;
+}
+static inline void kevent_stat_print(struct kevent_user *u)
+{
+	printk(KERN_INFO "%s: u: %p, wait: %lu, ring: %lu, immediately: %lu, total: %lu.\n",
+			__func__, u, u->wait_num, u->ring_num, u->im_num, u->total);
+}
+static inline void kevent_stat_im(struct kevent_user *u)
+{
+	u->im_num++;
+}
+static inline void kevent_stat_ring(struct kevent_user *u)
+{
+	u->ring_num++;
+}
+static inline void kevent_stat_wait(struct kevent_user *u)
+{
+	u->wait_num++;
+}
+static inline void kevent_stat_total(struct kevent_user *u)
+{
+	u->total++;
+}
+#else
+#define kevent_stat_print(u)		({ (void) u;})
+#define kevent_stat_init(u)		({ (void) u;})
+#define kevent_stat_im(u)		({ (void) u;})
+#define kevent_stat_wait(u)		({ (void) u;})
+#define kevent_stat_ring(u)		({ (void) u;})
+#define kevent_stat_total(u)		({ (void) u;})
+#endif
+
+#ifdef CONFIG_LOCKDEP
+void kevent_socket_reinit(struct socket *sock);
+void kevent_sk_reinit(struct sock *sk);
+#else
+static inline void kevent_socket_reinit(struct socket *sock)
+{
+}
+static inline void kevent_sk_reinit(struct sock *sk)
+{
+}
+#endif
+#ifdef CONFIG_KEVENT_SOCKET
+void kevent_socket_notify(struct sock *sock, u32 event);
+int kevent_socket_dequeue(struct kevent *k);
+int kevent_socket_enqueue(struct kevent *k);
+#define sock_async(__sk) sock_flag(__sk, SOCK_ASYNC)
+#else
+static inline void kevent_socket_notify(struct sock *sock, u32 event)
+{
+}
+#define sock_async(__sk)	({ (void)__sk; 0; })
+#endif
+
+#ifdef CONFIG_KEVENT_POLL
+static inline void kevent_init_file(struct file *file)
+{
+	kevent_storage_init(file, &file->st);
+}
+
+static inline void kevent_cleanup_file(struct file *file)
+{
+	kevent_storage_fini(&file->st);
+}
+#else
+static inline void kevent_init_file(struct file *file) {}
+static inline void kevent_cleanup_file(struct file *file) {}
+#endif
+
+#ifdef CONFIG_KEVENT_PIPE
+extern void kevent_pipe_notify(struct inode *inode, u32 events);
+#else
+static inline void kevent_pipe_notify(struct inode *inode, u32 events) {}
+#endif
+
+#ifdef CONFIG_KEVENT_SIGNAL
+extern int kevent_signal_notify(struct task_struct *tsk, int sig);
+#else
+static inline int kevent_signal_notify(struct task_struct *tsk, int sig) {return 0;}
+#endif
+
+#endif /* __KEVENT_H */
diff --git a/include/linux/kevent_storage.h b/include/linux/kevent_storage.h
new file mode 100644
index 0000000..a38575d
--- /dev/null
+++ b/include/linux/kevent_storage.h
@@ -0,0 +1,11 @@
+#ifndef __KEVENT_STORAGE_H
+#define __KEVENT_STORAGE_H
+
+struct kevent_storage
+{
+	void			*origin;		/* Originator's pointer, e.g. struct sock or struct file. Can be NULL. */
+	struct list_head	list;			/* List of queued kevents. */
+	spinlock_t		lock;			/* Protects users queue. */
+};
+
+#endif /* __KEVENT_STORAGE_H */
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 2d1c3d5..7574ec3 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -54,6 +54,8 @@ struct compat_stat;
 struct compat_timeval;
 struct robust_list_head;
 struct getcpu_cache;
+struct ukevent;
+struct kevent_ring;
 
 #include <linux/types.h>
 #include <linux/aio_abi.h>
@@ -599,4 +601,11 @@ asmlinkage long sys_set_robust_list(struct robust_list_head __user *head,
 				    size_t len);
 asmlinkage long sys_getcpu(unsigned __user *cpu, unsigned __user *node, struct getcpu_cache __user *cache);
 
+asmlinkage long sys_kevent_get_events(int ctl_fd, unsigned int min, unsigned int max,
+		struct timespec timeout, struct ukevent __user *buf, unsigned flags);
+asmlinkage long sys_kevent_ctl(int ctl_fd, unsigned int cmd, unsigned int num, struct ukevent __user *buf);
+asmlinkage long sys_kevent_wait(int ctl_fd, unsigned int num, unsigned int old_uidx, 
+		struct timespec timeout, unsigned int flags);
+asmlinkage long sys_kevent_commit(int ctl_fd, unsigned int new_uidx, unsigned int over);
+asmlinkage long sys_kevent_init(int ctl_fd, struct kevent_ring __user *ring, unsigned int num, unsigned int flags);
 #endif
diff --git a/include/linux/ukevent.h b/include/linux/ukevent.h
new file mode 100644
index 0000000..01932d0
--- /dev/null
+++ b/include/linux/ukevent.h
@@ -0,0 +1,185 @@
+/*
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef __UKEVENT_H
+#define __UKEVENT_H
+
+#include <linux/types.h>
+
+/*
+ * Kevent request flags.
+ */
+
+/* Process this event only once and then remove it. */
+#define KEVENT_REQ_ONESHOT	0x1
+/* Kevent wakes up only first thread interested in given event,
+ * or all threads if this flag is set.
+ */
+#define KEVENT_REQ_WAKEUP_ALL	0x2
+/* Edge Triggered behaviour. */
+#define KEVENT_REQ_ET		0x4
+/* Perform the last check on kevent (call appropriate callback) when
+ * kevent is marked as ready and has been removed from ready queue.
+ * If it will be confirmed that kevent is ready 
+ * (k->callbacks.callback(k) returns true) then kevent will be copied
+ * to userspace, otherwise it will be requeued back to storage. 
+ * Second (checking) call is performed with this bit _cleared_ so
+ * callback can detect when it was called from 
+ * kevent_storage_ready() - bit is set, or 
+ * kevent_dequeue_ready() - bit is cleared. 
+ * If kevent will be requeued, bit will be set again. */
+#define KEVENT_REQ_LAST_CHECK	0x8
+/*
+ * Always queue kevent even if it is immediately ready.
+ */
+#define KEVENT_REQ_ALWAYS_QUEUE	0x16
+
+/*
+ * Kevent return flags.
+ */
+/* Kevent is broken. */
+#define KEVENT_RET_BROKEN	0x1
+/* Kevent processing was finished successfully. */
+#define KEVENT_RET_DONE		0x2
+/* Kevent was not copied into ring buffer due to some error conditions. */
+#define KEVENT_RET_COPY_FAILED	0x4
+
+/*
+ * Kevent type set.
+ */
+#define KEVENT_SOCKET 		0
+#define KEVENT_INODE		1
+#define KEVENT_TIMER		2
+#define KEVENT_POLL		3
+#define KEVENT_NAIO		4
+#define KEVENT_AIO		5
+#define KEVENT_PIPE		6
+#define KEVENT_SIGNAL		7
+#define KEVENT_POSIX_TIMER	8
+
+/* Used as array size, so must be equal to the last KEVENT type + 1 */
+#define	KEVENT_MAX		9
+
+/*
+ * Per-type event sets.
+ * Number of per-event sets should be exactly as number of kevent types.
+ */
+
+/*
+ * Timer events.
+ */
+#define	KEVENT_TIMER_FIRED	0x1
+
+/*
+ * Socket/network asynchronous IO and PIPE events.
+ */
+#define	KEVENT_SOCKET_RECV	0x1
+#define	KEVENT_SOCKET_ACCEPT	0x2
+#define	KEVENT_SOCKET_SEND	0x4
+
+/*
+ * Inode events.
+ */
+#define	KEVENT_INODE_CREATE	0x1
+#define	KEVENT_INODE_REMOVE	0x2
+
+/*
+ * Poll events.
+ */
+#define	KEVENT_POLL_POLLIN	0x0001
+#define	KEVENT_POLL_POLLPRI	0x0002
+#define	KEVENT_POLL_POLLOUT	0x0004
+#define	KEVENT_POLL_POLLERR	0x0008
+#define	KEVENT_POLL_POLLHUP	0x0010
+#define	KEVENT_POLL_POLLNVAL	0x0020
+
+#define	KEVENT_POLL_POLLRDNORM	0x0040
+#define	KEVENT_POLL_POLLRDBAND	0x0080
+#define	KEVENT_POLL_POLLWRNORM	0x0100
+#define	KEVENT_POLL_POLLWRBAND	0x0200
+#define	KEVENT_POLL_POLLMSG	0x0400
+#define	KEVENT_POLL_POLLREMOVE	0x1000
+
+/*
+ * Asynchronous IO events.
+ */
+#define	KEVENT_AIO_BIO		0x1
+
+/*
+ * Signal events.
+ */
+#define KEVENT_SIGNAL_DELIVERY		0x1
+
+/* If set in raw64, then given signals will not be delivered
+ * in a usual way through sigmask update and signal callback 
+ * invocation. */
+#define KEVENT_SIGNAL_NOMASK	0x8000000000000000ULL
+
+/* Mask of all possible event values. */
+#define KEVENT_MASK_ALL		0xffffffff
+/* Empty mask of ready events. */
+#define KEVENT_MASK_EMPTY	0x0
+
+struct kevent_id
+{
+	union {
+		__u32		raw[2];
+		__u64		raw_u64 __attribute__((aligned(8)));
+	};
+};
+
+struct ukevent
+{
+	/* Id of this request, e.g. socket number, file descriptor and so on... */
+	struct kevent_id	id;
+	/* Event type, e.g. KEVENT_SOCK, KEVENT_INODE, KEVENT_TIMER and so on... */
+	__u32			type;
+	/* Event itself, e.g. SOCK_ACCEPT, INODE_CREATED, TIMER_FIRED... */
+	__u32			event;
+	/* Per-event request flags */
+	__u32			req_flags;
+	/* Per-event return flags */
+	__u32			ret_flags;
+	/* Event return data. Event originator fills it with anything it likes. */
+	__u32			ret_data[2];
+	/* User's data. It is not used, just copied to/from user.
+	 * The whole structure is aligned to 8 bytes already, so the last union
+	 * is aligned properly.
+	 */
+	union {
+		__u32		user[2];
+		void		*ptr;
+	};
+};
+
+struct kevent_ring
+{
+	unsigned int		ring_kidx, ring_over;
+	struct ukevent		event[0];
+};
+
+#define	KEVENT_CTL_ADD 		0
+#define	KEVENT_CTL_REMOVE	1
+#define	KEVENT_CTL_MODIFY	2
+#define	KEVENT_CTL_READY	3
+
+/* Provided timespec parameter uses absolute time, i.e. 'wait until Aug 27, 2194' */
+#define KEVENT_FLAGS_ABSTIME	1
+
+#endif /* __UKEVENT_H */
diff --git a/init/Kconfig b/init/Kconfig
index d2eb7a8..c7d8250 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -201,6 +201,8 @@ config AUDITSYSCALL
 	  such as SELinux.  To use audit's filesystem watch feature, please
 	  ensure that INOTIFY is configured.
 
+source "kernel/kevent/Kconfig"
+
 config IKCONFIG
 	bool "Kernel .config support"
 	---help---
diff --git a/kernel/Makefile b/kernel/Makefile
index d62ec66..2d7a6dd 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -47,6 +47,7 @@ obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
 obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
+obj-$(CONFIG_KEVENT) += kevent/
 obj-$(CONFIG_RELAY) += relay.o
 obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
 obj-$(CONFIG_TASKSTATS) += taskstats.o
diff --git a/kernel/kevent/Kconfig b/kernel/kevent/Kconfig
new file mode 100644
index 0000000..c6cdb0a
--- /dev/null
+++ b/kernel/kevent/Kconfig
@@ -0,0 +1,66 @@
+config KEVENT
+	bool "Kernel event notification mechanism" if EMBEDDED
+	default y
+	help
+	  This option enables event queue mechanism.
+	  It can be used as replacement for poll()/select(), AIO callback
+	  invocations, advanced timer notifications and other kernel
+	  object status changes.
+
+config KEVENT_USER_STAT
+	bool "Kevent user statistic"
+	depends on KEVENT
+	help
+	  This option will turn kevent_user statistic collection on.
+	  Statistic data includes total number of kevent, number of kevents
+	  which are ready immediately at insertion time and number of kevents
+	  which were removed through readiness completion.
+	  It will be printed each time control kevent descriptor is closed.
+
+config KEVENT_TIMER
+	bool "Kernel event notifications for timers"
+	depends on KEVENT
+	default y
+	help
+	  This option allows to use timers through KEVENT subsystem.
+
+config KEVENT_POLL
+	bool "Kernel event notifications for poll()/select()"
+	depends on KEVENT
+	default y
+	help
+	  This option allows to use kevent subsystem for poll()/select()
+	  notifications.
+
+config KEVENT_SOCKET
+	bool "Kernel event notifications for sockets"
+	depends on NET && KEVENT
+	default y
+	help
+	  This option enables notifications through KEVENT subsystem of 
+	  sockets operations, like new packet receiving conditions, 
+	  ready for accept conditions and so on.
+
+config KEVENT_PIPE
+	bool "Kernel event notifications for pipes"
+	depends on KEVENT
+	default y
+	help
+	  This option enables notifications through KEVENT subsystem of 
+	  pipe read/write operations.
+
+config KEVENT_SIGNAL
+	bool "Kernel event notifications for signals"
+	depends on KEVENT
+	default y
+	help
+	  This option enables signal delivery through KEVENT subsystem.
+	  Signals which were requested to be delivered through kevent
+	  subsystem must be registered through usual signal() and others
+	  syscalls, this option allows alternative delivery.
+	  With KEVENT_SIGNAL_NOMASK flag being set in kevent for set of 
+	  signals, they will not be delivered in a usual way.
+	  Kevents for appropriate signals are not copied when process forks,
+	  new process must add new kevents after fork(). Mask of signals
+	  is copied as before.
+
diff --git a/kernel/kevent/Makefile b/kernel/kevent/Makefile
new file mode 100644
index 0000000..f98e0c8
--- /dev/null
+++ b/kernel/kevent/Makefile
@@ -0,0 +1,6 @@
+obj-y := kevent.o kevent_user.o
+obj-$(CONFIG_KEVENT_TIMER) += kevent_timer.o
+obj-$(CONFIG_KEVENT_POLL) += kevent_poll.o
+obj-$(CONFIG_KEVENT_SOCKET) += kevent_socket.o
+obj-$(CONFIG_KEVENT_PIPE) += kevent_pipe.o
+obj-$(CONFIG_KEVENT_SIGNAL) += kevent_signal.o
diff --git a/kernel/kevent/kevent.c b/kernel/kevent/kevent.c
new file mode 100644
index 0000000..1781c57
--- /dev/null
+++ b/kernel/kevent/kevent.c
@@ -0,0 +1,265 @@
+/*
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/mempool.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/kevent.h>
+
+/*
+ * Attempts to add an event into appropriate origin's queue.
+ * Returns positive value if this event is ready immediately,
+ * negative value in case of error and zero if event has been queued.
+ * ->enqueue() callback must increase origin's reference counter.
+ */
+int kevent_enqueue(struct kevent *k)
+{
+	return k->callbacks.enqueue(k);
+}
+
+/*
+ * Remove event from the appropriate queue.
+ * ->dequeue() callback must decrease origin's reference counter.
+ */
+int kevent_dequeue(struct kevent *k)
+{
+	return k->callbacks.dequeue(k);
+}
+
+/*
+ * Mark kevent as broken.
+ */
+int kevent_break(struct kevent *k)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&k->ulock, flags);
+	k->event.ret_flags |= KEVENT_RET_BROKEN;
+	spin_unlock_irqrestore(&k->ulock, flags);
+	return -EINVAL;
+}
+
+static struct kevent_callbacks kevent_registered_callbacks[KEVENT_MAX] __read_mostly;
+
+int kevent_event_is_allowed(struct ukevent *e)
+{
+	if (unlikely(e->type >= KEVENT_MAX))
+		return 0;
+
+	if (!kevent_registered_callbacks[e->type].callback)
+		return 0;
+
+	if (unlikely(kevent_registered_callbacks[e->type].callback == kevent_break))
+		return 0;
+
+	if (kevent_registered_callbacks[e->type].flags & KEVENT_CALLBACKS_KERNELONLY)
+		return 0;
+	
+	return 1;
+}
+
+int kevent_add_callbacks(const struct kevent_callbacks *cb, int pos)
+{
+	struct kevent_callbacks *p;
+
+	if (pos >= KEVENT_MAX)
+		return -EINVAL;
+
+	p = &kevent_registered_callbacks[pos];
+
+	p->enqueue = (cb->enqueue) ? cb->enqueue : kevent_break;
+	p->dequeue = (cb->dequeue) ? cb->dequeue : kevent_break;
+	p->callback = (cb->callback) ? cb->callback : kevent_break;
+	p->flags = cb->flags;
+
+	printk(KERN_INFO "KEVENT: Added callbacks for type %d.\n", pos);
+	return 0;
+}
+
+/*
+ * Must be called before event is going to be added into some origin's queue.
+ * Initializes ->enqueue(), ->dequeue() and ->callback() callbacks.
+ * If failed, kevent should not be used or kevent_enqueue() will fail to add
+ * this kevent into origin's queue with setting
+ * KEVENT_RET_BROKEN flag in kevent->event.ret_flags.
+ */
+int kevent_init(struct kevent *k)
+{
+	spin_lock_init(&k->ulock);
+	k->flags = 0;
+
+	if (unlikely(k->event.type >= KEVENT_MAX)) {
+		kevent_break(k);
+		return -ENOSYS;
+	}
+
+	if (!kevent_registered_callbacks[k->event.type].callback) {
+		kevent_break(k);
+		return -ENOSYS;
+	}
+
+	k->callbacks = kevent_registered_callbacks[k->event.type];
+	if (unlikely(k->callbacks.callback == kevent_break)) {
+		kevent_break(k);
+		return -ENOSYS;
+	}
+
+	return 0;
+}
+
+/*
+ * Called from ->enqueue() callback when reference counter for given
+ * origin (socket, inode...) has been increased.
+ */
+int kevent_storage_enqueue(struct kevent_storage *st, struct kevent *k)
+{
+	unsigned long flags;
+
+	k->st = st;
+	spin_lock_irqsave(&st->lock, flags);
+	list_add_tail_rcu(&k->storage_entry, &st->list);
+	k->flags |= KEVENT_STORAGE;
+	spin_unlock_irqrestore(&st->lock, flags);
+	return 0;
+}
+
+/*
+ * Dequeue kevent from origin's queue.
+ * It does not decrease origin's reference counter in any way
+ * and must be called before it, so storage itself must be valid.
+ * It is called from ->dequeue() callback.
+ */
+void kevent_storage_dequeue(struct kevent_storage *st, struct kevent *k)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&st->lock, flags);
+	if (k->flags & KEVENT_STORAGE) {
+		list_del_rcu(&k->storage_entry);
+		k->flags &= ~KEVENT_STORAGE;
+	}
+	spin_unlock_irqrestore(&st->lock, flags);
+}
+
+void kevent_ready(struct kevent *k, int ret)
+{
+	unsigned long flags;
+	int rem;
+
+	spin_lock_irqsave(&k->ulock, flags);
+	if (ret > 0)
+		k->event.ret_flags |= KEVENT_RET_DONE;
+	else if (ret < 0)
+		k->event.ret_flags |= (KEVENT_RET_BROKEN | KEVENT_RET_DONE);
+	else
+		ret = (k->event.ret_flags & (KEVENT_RET_BROKEN|KEVENT_RET_DONE));
+	rem = (k->event.req_flags & KEVENT_REQ_ONESHOT);
+	spin_unlock_irqrestore(&k->ulock, flags);
+
+	if (ret) {
+		if ((rem || ret < 0) && (k->flags & KEVENT_STORAGE)) {
+			list_del_rcu(&k->storage_entry);
+			k->flags &= ~KEVENT_STORAGE;
+		}
+
+		spin_lock_irqsave(&k->user->ready_lock, flags);
+		if (!(k->flags & KEVENT_READY)) {
+			list_add_tail(&k->ready_entry, &k->user->ready_list);
+			k->flags |= KEVENT_READY;
+			k->user->ready_num++;
+		}
+		spin_unlock_irqrestore(&k->user->ready_lock, flags);
+		wake_up(&k->user->wait);
+	}
+}
+
+/*
+ * Call kevent ready callback and queue it into ready queue if needed.
+ * If kevent is marked as one-shot, then remove it from storage queue.
+ */
+static int __kevent_requeue(struct kevent *k, u32 event)
+{
+	int ret;
+
+	ret = k->callbacks.callback(k);
+
+	kevent_ready(k, ret);
+
+	return ret;
+}
+
+/*
+ * Check if kevent is ready (by invoking it's callback) and requeue/remove
+ * if needed.
+ */
+void kevent_requeue(struct kevent *k)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&k->st->lock, flags);
+	__kevent_requeue(k, 0);
+	spin_unlock_irqrestore(&k->st->lock, flags);
+}
+
+/*
+ * Called each time some activity in origin (socket, inode...) is noticed.
+ */
+void kevent_storage_ready(struct kevent_storage *st,
+		kevent_callback_t ready_callback, u32 event)
+{
+	struct kevent *k;
+	int wake_num = 0;
+
+	rcu_read_lock();
+	if (unlikely(ready_callback))
+		list_for_each_entry_rcu(k, &st->list, storage_entry)
+			(*ready_callback)(k);
+
+	list_for_each_entry_rcu(k, &st->list, storage_entry) {
+		if (event & k->event.event)
+			if ((k->event.req_flags & KEVENT_REQ_WAKEUP_ALL) || wake_num == 0)
+				if (__kevent_requeue(k, event))
+					wake_num++;
+	}
+	rcu_read_unlock();
+}
+
+int kevent_storage_init(void *origin, struct kevent_storage *st)
+{
+	spin_lock_init(&st->lock);
+	st->origin = origin;
+	INIT_LIST_HEAD(&st->list);
+	return 0;
+}
+
+/*
+ * Mark all events as broken, that will remove them from storage,
+ * so storage origin (inode, socket and so on) can be safely removed.
+ * No new entries are allowed to be added into the storage at this point.
+ * (Socket is removed from file table at this point for example).
+ */
+void kevent_storage_fini(struct kevent_storage *st)
+{
+	kevent_storage_ready(st, kevent_break, KEVENT_MASK_ALL);
+}
diff --git a/kernel/kevent/kevent_user.c b/kernel/kevent/kevent_user.c
new file mode 100644
index 0000000..7bd3d7a
--- /dev/null
+++ b/kernel/kevent/kevent_user.c
@@ -0,0 +1,1360 @@
+/*
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/device.h>
+#include <linux/poll.h>
+#include <linux/kevent.h>
+#include <linux/miscdevice.h>
+#include <asm/io.h>
+
+static kmem_cache_t *kevent_cache __read_mostly;
+static kmem_cache_t *kevent_user_cache __read_mostly;
+
+static int kevent_debug_abstime;
+
+/*
+ * kevents are pollable, return POLLIN and POLLRDNORM
+ * when there is at least one ready kevent.
+ */
+static unsigned int kevent_user_poll(struct file *file, struct poll_table_struct *wait)
+{
+	struct kevent_user *u = file->private_data;
+	unsigned int mask;
+
+	poll_wait(file, &u->wait, wait);
+	mask = 0;
+
+	if (u->ready_num || u->need_exit)
+		mask |= POLLIN | POLLRDNORM;
+	u->need_exit = 0;
+
+	return mask;
+}
+
+static inline unsigned int kevent_ring_space(struct kevent_user *u)
+{
+	if (u->full)
+		return 0;
+
+	return (u->uidx > u->kidx)?
+		(u->uidx - u->kidx):
+		(u->ring_size - (u->kidx - u->uidx));
+}
+
+static inline int kevent_ring_index_inc(unsigned int *pidx, unsigned int size)
+{
+	unsigned int idx = *pidx;
+
+	if (++idx >= size)
+		idx = 0;
+	*pidx = idx;
+	return (idx == 0);
+}
+
+/*
+ * Copies kevent into userspace ring buffer if it was initialized.
+ * Returns 
+ *  0 on success or if ring buffer is not used
+ *  -EAGAIN if there were no place for that kevent
+ *  -EFAULT if copy_to_user() failed.
+ *
+ *  Must be called under kevent_user->ring_lock locked.
+ */
+static int kevent_copy_ring_buffer(struct kevent *k)
+{
+	struct kevent_ring __user *ring;
+	struct kevent_user *u = k->user;
+	unsigned long flags;
+	int err;
+
+	ring = u->pring;
+	if (!ring)
+		return 0;
+
+	if (!kevent_ring_space(u))
+		return -EAGAIN;
+
+	if (copy_to_user(&ring->event[u->kidx], &k->event, sizeof(struct ukevent))) {
+		err = -EFAULT;
+		goto err_out_exit;
+	}
+
+	kevent_ring_index_inc(&u->kidx, u->ring_size);
+
+	if (u->kidx == u->uidx)
+		u->full = 1;
+
+	if (put_user(u->kidx, &ring->ring_kidx)) {
+		err = -EFAULT;
+		goto err_out_exit;
+	}
+
+	return 0;
+
+err_out_exit:
+	spin_lock_irqsave(&k->ulock, flags);
+	k->event.ret_flags |= KEVENT_RET_COPY_FAILED;
+	spin_unlock_irqrestore(&k->ulock, flags);
+	return err;
+}
+
+static struct kevent_user *kevent_user_alloc(struct kevent_ring __user *ring, unsigned int num)
+{
+	struct kevent_user *u;
+
+	u = kmem_cache_alloc(kevent_user_cache, GFP_KERNEL);
+	if (!u)
+		return NULL;
+
+	INIT_LIST_HEAD(&u->ready_list);
+	spin_lock_init(&u->ready_lock);
+	kevent_stat_init(u);
+	spin_lock_init(&u->kevent_lock);
+	u->kevent_root = RB_ROOT;
+
+	mutex_init(&u->ctl_mutex);
+	init_waitqueue_head(&u->wait);
+	u->need_exit = 0;
+
+	atomic_set(&u->refcnt, 1);
+
+	mutex_init(&u->ring_lock);
+	u->kidx = u->uidx = u->ring_over = u->full = 0;
+
+	u->pring = ring;
+	u->ring_size = num;
+
+	hrtimer_init(&u->timer, CLOCK_REALTIME, HRTIMER_ABS);
+
+	return u;
+}
+
+/*
+ * Kevent userspace control block reference counting.
+ * Set to 1 at creation time, when appropriate kevent file descriptor
+ * is closed, that reference counter is decreased.
+ * When counter hits zero block is freed.
+ */
+static inline void kevent_user_get(struct kevent_user *u)
+{
+	atomic_inc(&u->refcnt);
+}
+
+static inline void kevent_user_put(struct kevent_user *u)
+{
+	if (atomic_dec_and_test(&u->refcnt)) {
+		kevent_stat_print(u);
+		hrtimer_cancel(&u->timer);
+		kmem_cache_free(kevent_user_cache, u);
+	}
+}
+
+static inline int kevent_compare_id(struct kevent_id *left, struct kevent_id *right)
+{
+	if (left->raw_u64 > right->raw_u64)
+		return -1;
+
+	if (right->raw_u64 > left->raw_u64)
+		return 1;
+
+	return 0;
+}
+
+/*
+ * RCU protects storage list (kevent->storage_entry).
+ * Free entry in RCU callback, it is dequeued from all lists at
+ * this point.
+ */
+
+static void kevent_free_rcu(struct rcu_head *rcu)
+{
+	struct kevent *kevent = container_of(rcu, struct kevent, rcu_head);
+	kmem_cache_free(kevent_cache, kevent);
+}
+
+/*
+ * Must be called under u->ready_lock.
+ * This function unlinks kevent from ready queue.
+ */
+static inline void kevent_unlink_ready(struct kevent *k)
+{
+	list_del(&k->ready_entry);
+	k->flags &= ~KEVENT_READY;
+	k->user->ready_num--;
+}
+
+static void kevent_remove_ready(struct kevent *k)
+{
+	struct kevent_user *u = k->user;
+	unsigned long flags;
+
+	spin_lock_irqsave(&u->ready_lock, flags);
+	if (k->flags & KEVENT_READY)
+		kevent_unlink_ready(k);
+	spin_unlock_irqrestore(&u->ready_lock, flags);
+}
+
+/*
+ * Complete kevent removing - it dequeues kevent from storage list
+ * if it is requested, removes kevent from ready list, drops userspace
+ * control block reference counter and schedules kevent freeing through RCU.
+ */
+static void kevent_finish_user_complete(struct kevent *k, int deq)
+{
+	if (deq)
+		kevent_dequeue(k);
+
+	kevent_remove_ready(k);
+
+	kevent_user_put(k->user);
+	call_rcu(&k->rcu_head, kevent_free_rcu);
+}
+
+/*
+ * Remove from all lists and free kevent.
+ * Must be called under kevent_user->kevent_lock to protect
+ * kevent->kevent_entry removing.
+ */
+static void __kevent_finish_user(struct kevent *k, int deq)
+{
+	struct kevent_user *u = k->user;
+
+	rb_erase(&k->kevent_node, &u->kevent_root);
+	k->flags &= ~KEVENT_USER;
+	u->kevent_num--;
+	kevent_finish_user_complete(k, deq);
+}
+
+/*
+ * Remove kevent from user's list of all events,
+ * dequeue it from storage and decrease user's reference counter,
+ * since this kevent does not exist anymore. That is why it is freed here.
+ */
+static void kevent_finish_user(struct kevent *k, int deq)
+{
+	struct kevent_user *u = k->user;
+	unsigned long flags;
+
+	spin_lock_irqsave(&u->kevent_lock, flags);
+	rb_erase(&k->kevent_node, &u->kevent_root);
+	k->flags &= ~KEVENT_USER;
+	u->kevent_num--;
+	spin_unlock_irqrestore(&u->kevent_lock, flags);
+	kevent_finish_user_complete(k, deq);
+}
+
+static struct kevent *__kevent_dequeue_ready_one(struct kevent_user *u)
+{
+	unsigned long flags;
+	struct kevent *k = NULL;
+
+	if (u->ready_num) {
+		spin_lock_irqsave(&u->ready_lock, flags);
+		if (u->ready_num && !list_empty(&u->ready_list)) {
+			k = list_entry(u->ready_list.next, struct kevent, ready_entry);
+			kevent_unlink_ready(k);
+		}
+		spin_unlock_irqrestore(&u->ready_lock, flags);
+	}
+
+	return k;
+}
+
+static struct kevent *kevent_dequeue_ready_one(struct kevent_user *u)
+{
+	struct kevent *k = NULL;
+
+	while (u->ready_num && !k) {
+		k = __kevent_dequeue_ready_one(u);
+
+		if (k && (k->event.req_flags & KEVENT_REQ_LAST_CHECK)) {
+			unsigned long flags;
+
+			spin_lock_irqsave(&k->ulock, flags);
+			k->event.req_flags &= ~KEVENT_REQ_LAST_CHECK;
+			spin_unlock_irqrestore(&k->ulock, flags);
+
+			if (!k->callbacks.callback(k)) {
+				spin_lock_irqsave(&k->ulock, flags);
+				k->event.req_flags |= KEVENT_REQ_LAST_CHECK;
+				k->event.ret_flags = 0;
+				k->event.ret_data[0] = k->event.ret_data[1] = 0;
+				spin_unlock_irqrestore(&k->ulock, flags);
+				k = NULL;
+			}
+		} else
+			break;
+	}
+
+	return k;
+}
+
+static inline void kevent_copy_ring(struct kevent *k)
+{
+	unsigned long flags;
+
+	if (!k)
+		return;
+
+	if (kevent_copy_ring_buffer(k)) {
+		spin_lock_irqsave(&k->ulock, flags);
+		k->event.ret_flags |= KEVENT_RET_COPY_FAILED;
+		spin_unlock_irqrestore(&k->ulock, flags);
+	}
+}
+
+/*
+ * Dequeue one entry from user's ready queue.
+ */
+static struct kevent *kevent_dequeue_ready(struct kevent_user *u)
+{
+	struct kevent *k;
+
+	mutex_lock(&u->ring_lock);
+	k = kevent_dequeue_ready_one(u);
+	kevent_copy_ring(k);
+	mutex_unlock(&u->ring_lock);
+
+	return k;
+}
+
+/*
+ * Dequeue one entry from user's ready queue if there is space in ring buffer.
+ */
+static struct kevent *kevent_dequeue_ready_ring(struct kevent_user *u)
+{
+	struct kevent *k = NULL;
+
+	mutex_lock(&u->ring_lock);
+	if (kevent_ring_space(u)) {
+		k = kevent_dequeue_ready_one(u);
+		kevent_copy_ring(k);
+	}
+	mutex_unlock(&u->ring_lock);
+
+	return k;
+}
+
+static void kevent_complete_ready(struct kevent *k)
+{
+	if (k->event.req_flags & KEVENT_REQ_ONESHOT)
+		/*
+		 * If it is one-shot kevent, it has been removed already from
+		 * origin's queue, so we can easily free it here.
+		 */
+		kevent_finish_user(k, 1);
+	else if (k->event.req_flags & KEVENT_REQ_ET) {
+		unsigned long flags;
+
+		/*
+		 * Edge-triggered behaviour: mark event as clear new one.
+		 */
+
+		spin_lock_irqsave(&k->ulock, flags);
+		k->event.ret_flags = 0;
+		k->event.ret_data[0] = k->event.ret_data[1] = 0;
+		spin_unlock_irqrestore(&k->ulock, flags);
+	}
+}
+
+/*
+ * Search a kevent inside kevent tree for given ukevent.
+ */
+static struct kevent *__kevent_search(struct kevent_id *id, struct kevent_user *u)
+{
+	struct kevent *k, *ret = NULL;
+	struct rb_node *n = u->kevent_root.rb_node;
+	int cmp;
+
+	while (n) {
+		k = rb_entry(n, struct kevent, kevent_node);
+		cmp = kevent_compare_id(&k->event.id, id);
+
+		if (cmp > 0)
+			n = n->rb_right;
+		else if (cmp < 0)
+			n = n->rb_left;
+		else {
+			ret = k;
+			break;
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * Search and modify kevent according to provided ukevent.
+ */
+static int kevent_modify(struct ukevent *uk, struct kevent_user *u)
+{
+	struct kevent *k;
+	int err = -ENODEV;
+	unsigned long flags;
+
+	spin_lock_irqsave(&u->kevent_lock, flags);
+	k = __kevent_search(&uk->id, u);
+	if (k) {
+		spin_lock(&k->ulock);
+		k->event.event = uk->event;
+		k->event.req_flags = uk->req_flags;
+		k->event.ret_flags = 0;
+		spin_unlock(&k->ulock);
+		kevent_requeue(k);
+		err = 0;
+	}
+	spin_unlock_irqrestore(&u->kevent_lock, flags);
+
+	return err;
+}
+
+/*
+ * Remove kevent which matches provided ukevent.
+ */
+static int kevent_remove(struct ukevent *uk, struct kevent_user *u)
+{
+	int err = -ENODEV;
+	struct kevent *k;
+	unsigned long flags;
+
+	spin_lock_irqsave(&u->kevent_lock, flags);
+	k = __kevent_search(&uk->id, u);
+	if (k) {
+		__kevent_finish_user(k, 1);
+		err = 0;
+	}
+	spin_unlock_irqrestore(&u->kevent_lock, flags);
+
+	return err;
+}
+
+/*
+ * Detaches userspace control block from file descriptor
+ * and decrease it's reference counter.
+ * No new kevents can be added or removed from any list at this point.
+ */
+static int kevent_user_release(struct inode *inode, struct file *file)
+{
+	struct kevent_user *u = file->private_data;
+	struct kevent *k;
+	struct rb_node *n;
+
+	for (n = rb_first(&u->kevent_root); n; n = rb_next(n)) {
+		k = rb_entry(n, struct kevent, kevent_node);
+		kevent_finish_user(k, 1);
+	}
+
+	kevent_user_put(u);
+	file->private_data = NULL;
+
+	return 0;
+}
+
+/*
+ * Read requested number of ukevents in one shot.
+ */
+static struct ukevent *kevent_get_user(unsigned int num, void __user *arg)
+{
+	struct ukevent *ukev;
+
+	ukev = kmalloc(sizeof(struct ukevent) * num, GFP_KERNEL);
+	if (!ukev)
+		return NULL;
+
+	if (copy_from_user(ukev, arg, sizeof(struct ukevent) * num)) {
+		kfree(ukev);
+		return NULL;
+	}
+
+	return ukev;
+}
+
+static int kevent_mark_ready(struct ukevent *uk, struct kevent_user *u)
+{
+	struct kevent *k;
+	int err = -ENODEV;
+	unsigned long flags;
+
+	spin_lock_irqsave(&u->kevent_lock, flags);
+	k = __kevent_search(&uk->id, u);
+	if (k) {
+		spin_lock(&k->st->lock);
+		kevent_ready(k, 1);
+		spin_unlock(&k->st->lock);
+		err = 0;
+	}
+	spin_unlock_irqrestore(&u->kevent_lock, flags);
+
+	return err;
+}
+
+/*
+ * Mark appropriate kevents as ready.
+ * If number of events is zero just wake up one listener.
+ */
+static int kevent_user_ctl_ready(struct kevent_user *u, unsigned int num, void __user *arg)
+{
+	int err = -EINVAL, cerr = 0, rnum = 0, i;
+	void __user *orig = arg;
+	struct ukevent uk;
+
+	if (num > u->kevent_num)
+		return err;
+	
+	if (!num) {
+		u->need_exit = 1;
+		wake_up(&u->wait);
+		return 0;
+	}
+
+	mutex_lock(&u->ctl_mutex);
+
+	if (num > KEVENT_MIN_BUFFS_ALLOC) {
+		struct ukevent *ukev;
+
+		ukev = kevent_get_user(num, arg);
+		if (ukev) {
+			for (i = 0; i < num; ++i) {
+				err = kevent_mark_ready(&ukev[i], u);
+				if (err) {
+					if (i != rnum)
+						memcpy(&ukev[rnum], &ukev[i], sizeof(struct ukevent));
+					rnum++;
+				}
+			}
+			if (copy_to_user(orig, ukev, rnum*sizeof(struct ukevent)))
+				cerr = -EFAULT;
+			kfree(ukev);
+			goto out_setup;
+		}
+	}
+
+	for (i = 0; i < num; ++i) {
+		if (copy_from_user(&uk, arg, sizeof(struct ukevent))) {
+			cerr = -EFAULT;
+			break;
+		}
+		arg += sizeof(struct ukevent);
+
+		err = kevent_mark_ready(&uk, u);
+		if (err) {
+			if (copy_to_user(orig, &uk, sizeof(struct ukevent))) {
+				cerr = -EFAULT;
+				break;
+			}
+			orig += sizeof(struct ukevent);
+			rnum++;
+		}
+	}
+
+out_setup:
+	if (cerr < 0) {
+		err = cerr;
+		goto out_remove;
+	}
+
+	err = num - rnum;
+out_remove:
+	mutex_unlock(&u->ctl_mutex);
+
+	return err;
+}
+
+/*
+ * Read from userspace all ukevents and modify appropriate kevents.
+ * If provided number of ukevents is more that threshold, it is faster
+ * to allocate a room for them and copy in one shot instead of copy
+ * one-by-one and then process them.
+ */
+static int kevent_user_ctl_modify(struct kevent_user *u, unsigned int num, void __user *arg)
+{
+	int err = 0, i;
+	struct ukevent uk;
+
+	mutex_lock(&u->ctl_mutex);
+
+	if (num > u->kevent_num) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	if (num > KEVENT_MIN_BUFFS_ALLOC) {
+		struct ukevent *ukev;
+
+		ukev = kevent_get_user(num, arg);
+		if (ukev) {
+			for (i = 0; i < num; ++i) {
+				if (kevent_modify(&ukev[i], u))
+					ukev[i].ret_flags |= KEVENT_RET_BROKEN;
+				ukev[i].ret_flags |= KEVENT_RET_DONE;
+			}
+			if (copy_to_user(arg, ukev, num*sizeof(struct ukevent)))
+				err = -EFAULT;
+			kfree(ukev);
+			goto out;
+		}
+	}
+
+	for (i = 0; i < num; ++i) {
+		if (copy_from_user(&uk, arg, sizeof(struct ukevent))) {
+			err = -EFAULT;
+			break;
+		}
+
+		if (kevent_modify(&uk, u))
+			uk.ret_flags |= KEVENT_RET_BROKEN;
+		uk.ret_flags |= KEVENT_RET_DONE;
+
+		if (copy_to_user(arg, &uk, sizeof(struct ukevent))) {
+			err = -EFAULT;
+			break;
+		}
+
+		arg += sizeof(struct ukevent);
+	}
+out:
+	mutex_unlock(&u->ctl_mutex);
+
+	return err;
+}
+
+/*
+ * Read from userspace all ukevents and remove appropriate kevents.
+ * If provided number of ukevents is more that threshold, it is faster
+ * to allocate a room for them and copy in one shot instead of copy
+ * one-by-one and then process them.
+ */
+static int kevent_user_ctl_remove(struct kevent_user *u, unsigned int num, void __user *arg)
+{
+	int err = 0, i;
+	struct ukevent uk;
+
+	mutex_lock(&u->ctl_mutex);
+
+	if (num > u->kevent_num) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	if (num > KEVENT_MIN_BUFFS_ALLOC) {
+		struct ukevent *ukev;
+
+		ukev = kevent_get_user(num, arg);
+		if (ukev) {
+			for (i = 0; i < num; ++i) {
+				if (kevent_remove(&ukev[i], u))
+					ukev[i].ret_flags |= KEVENT_RET_BROKEN;
+				ukev[i].ret_flags |= KEVENT_RET_DONE;
+			}
+			if (copy_to_user(arg, ukev, num*sizeof(struct ukevent)))
+				err = -EFAULT;
+			kfree(ukev);
+			goto out;
+		}
+	}
+
+	for (i = 0; i < num; ++i) {
+		if (copy_from_user(&uk, arg, sizeof(struct ukevent))) {
+			err = -EFAULT;
+			break;
+		}
+
+		if (kevent_remove(&uk, u))
+			uk.ret_flags |= KEVENT_RET_BROKEN;
+
+		uk.ret_flags |= KEVENT_RET_DONE;
+
+		if (copy_to_user(arg, &uk, sizeof(struct ukevent))) {
+			err = -EFAULT;
+			break;
+		}
+
+		arg += sizeof(struct ukevent);
+	}
+out:
+	mutex_unlock(&u->ctl_mutex);
+
+	return err;
+}
+
+/*
+ * Queue kevent into userspace control block and increase
+ * it's reference counter.
+ */
+static int kevent_user_enqueue(struct kevent_user *u, struct kevent *new)
+{
+	unsigned long flags;
+	struct rb_node **p = &u->kevent_root.rb_node, *parent = NULL;
+	struct kevent *k;
+	int err = 0, cmp;
+
+	spin_lock_irqsave(&u->kevent_lock, flags);
+	while (*p) {
+		parent = *p;
+		k = rb_entry(parent, struct kevent, kevent_node);
+
+		cmp = kevent_compare_id(&k->event.id, &new->event.id);
+		if (cmp > 0)
+			p = &parent->rb_right;
+		else if (cmp < 0)
+			p = &parent->rb_left;
+		else {
+			err = -EEXIST;
+			break;
+		}
+	}
+	if (likely(!err)) {
+		rb_link_node(&new->kevent_node, parent, p);
+		rb_insert_color(&new->kevent_node, &u->kevent_root);
+		new->flags |= KEVENT_USER;
+		u->kevent_num++;
+		kevent_user_get(u);
+	}
+	spin_unlock_irqrestore(&u->kevent_lock, flags);
+
+	return err;
+}
+
+/*
+ * Add kevent from both kernel and userspace users.
+ * This function allocates and queues kevent, returns negative value
+ * on error, positive if kevent is ready immediately and zero
+ * if kevent has been queued.
+ */
+int kevent_user_add_ukevent(struct ukevent *uk, struct kevent_user *u)
+{
+	struct kevent *k;
+	int err;
+
+	k = kmem_cache_alloc(kevent_cache, GFP_KERNEL);
+	if (!k) {
+		err = -ENOMEM;
+		goto err_out_exit;
+	}
+
+	memcpy(&k->event, uk, sizeof(struct ukevent));
+	INIT_RCU_HEAD(&k->rcu_head);
+
+	k->event.ret_flags = 0;
+
+	err = kevent_init(k);
+	if (err) {
+		kmem_cache_free(kevent_cache, k);
+		goto err_out_exit;
+	}
+	k->user = u;
+	kevent_stat_total(u);
+	err = kevent_user_enqueue(u, k);
+	if (err) {
+		kmem_cache_free(kevent_cache, k);
+		goto err_out_exit;
+	}
+
+	err = kevent_enqueue(k);
+	if (err) {
+		memcpy(uk, &k->event, sizeof(struct ukevent));
+		kevent_finish_user(k, 0);
+		goto err_out_exit;
+	}
+
+	return 0;
+
+err_out_exit:
+	if (err < 0) {
+		uk->ret_flags |= KEVENT_RET_BROKEN | KEVENT_RET_DONE;
+		uk->ret_data[1] = err;
+	} else if (err > 0)
+		uk->ret_flags |= KEVENT_RET_DONE;
+	return err;
+}
+
+
+/*
+ * Copy all ukevents from userspace, allocate kevent for each one
+ * and add them into appropriate kevent_storages,
+ * e.g. sockets, inodes and so on...
+ * Ready events will replace ones provided by used and number
+ * of ready events is returned.
+ * User must check ret_flags field of each ukevent structure
+ * to determine if it is fired or failed event.
+ */
+static int kevent_user_ctl_add(struct kevent_user *u, unsigned int num, void __user *arg)
+{
+	int err, cerr = 0, rnum = 0, i;
+	void __user *orig = arg;
+	struct ukevent uk;
+
+	mutex_lock(&u->ctl_mutex);
+
+	err = -EINVAL;
+	if (num > KEVENT_MIN_BUFFS_ALLOC) {
+		struct ukevent *ukev;
+
+		ukev = kevent_get_user(num, arg);
+		if (ukev) {
+			for (i = 0; i < num; ++i) {
+				if (!kevent_event_is_allowed(&ukev[i])) {
+					if (i != rnum)
+						memcpy(&ukev[rnum], 
+							&ukev[i], 
+							sizeof(struct ukevent));
+					rnum++;
+				} else {
+					err = kevent_user_add_ukevent(&ukev[i], u);
+					if (err) {
+						kevent_stat_im(u);
+						if (i != rnum)
+							memcpy(&ukev[rnum], 
+								&ukev[i], 
+								sizeof(struct ukevent));
+						rnum++;
+					}
+				}
+			}
+			if (copy_to_user(orig, ukev, rnum*sizeof(struct ukevent)))
+				cerr = -EFAULT;
+			kfree(ukev);
+			goto out_setup;
+		}
+	}
+
+	for (i = 0; i < num; ++i) {
+		if (copy_from_user(&uk, arg, sizeof(struct ukevent))) {
+			cerr = -EFAULT;
+			break;
+		}
+		arg += sizeof(struct ukevent);
+
+		if (!kevent_event_is_allowed(&uk)) {
+			err = 1;
+		} else {
+			err = kevent_user_add_ukevent(&uk, u);
+			if (err)
+				kevent_stat_im(u);
+		}
+		if (err) {
+			if (copy_to_user(orig, &uk, sizeof(struct ukevent))) {
+				cerr = -EFAULT;
+				break;
+			}
+			orig += sizeof(struct ukevent);
+			rnum++;
+		}
+	}
+
+out_setup:
+	if (cerr < 0) {
+		err = cerr;
+		goto out_remove;
+	}
+
+	err = rnum;
+out_remove:
+	mutex_unlock(&u->ctl_mutex);
+
+	return err;
+}
+
+/* Used to wakeup waiting syscalls in case high-resolution timer is used. */
+static int kevent_user_wake(struct hrtimer *timer)
+{
+	struct kevent_user *u = container_of(timer, struct kevent_user, timer);
+
+	u->need_exit = 1;
+	wake_up(&u->wait);
+
+	return HRTIMER_NORESTART;
+}
+
+
+/*
+ * In nonblocking mode it returns as many events as possible, but not more than @max_nr.
+ * In blocking mode it waits until timeout or if at least @min_nr events are ready.
+ */
+static int kevent_user_wait(struct file *file, struct kevent_user *u,
+		unsigned int min_nr, unsigned int max_nr, struct timespec timeout,
+		void __user *buf, unsigned int flags)
+{
+	struct kevent *k;
+	int num = 0;
+	long tm = MAX_SCHEDULE_TIMEOUT;
+
+	if (!(file->f_flags & O_NONBLOCK)) {
+		if (!timespec_valid(&timeout))
+			return -EINVAL;
+
+		if (flags & KEVENT_FLAGS_ABSTIME) {
+			hrtimer_cancel(&u->timer);
+			hrtimer_init(&u->timer, CLOCK_REALTIME, HRTIMER_ABS);
+			u->timer.expires = ktime_set(timeout.tv_sec, timeout.tv_nsec);
+			u->timer.function = &kevent_user_wake;
+			hrtimer_start(&u->timer, u->timer.expires, HRTIMER_ABS);
+			if (unlikely(kevent_debug_abstime == 0)) {
+				printk(KERN_INFO "kevent: author was wrong, "
+						"someone uses absolute time in %s, "
+						"please report to remove this warning.\n", __func__);
+				kevent_debug_abstime = 1;
+			}
+		} else {
+			tm = timespec_to_jiffies(&timeout);
+		}
+
+		wait_event_interruptible_timeout(u->wait,
+			((u->ready_num >= 1) && kevent_ring_space(u)) || u->need_exit, tm);
+	}
+	u->need_exit = 0;
+
+	while (num < max_nr && ((k = kevent_dequeue_ready(u)) != NULL)) {
+		if (copy_to_user(buf + num*sizeof(struct ukevent),
+					&k->event, sizeof(struct ukevent))) {
+			if (num == 0)
+				num = -EFAULT;
+			break;
+		}
+		kevent_complete_ready(k);
+		++num;
+		kevent_stat_wait(u);
+	}
+
+	return num;
+}
+
+struct file_operations kevent_user_fops = {
+	.release	= kevent_user_release,
+	.poll		= kevent_user_poll,
+	.owner		= THIS_MODULE,
+};
+
+static int kevent_ctl_process(struct file *file, unsigned int cmd, unsigned int num, void __user *arg)
+{
+	int err;
+	struct kevent_user *u = file->private_data;
+
+	switch (cmd) {
+	case KEVENT_CTL_ADD:
+		err = kevent_user_ctl_add(u, num, arg);
+		break;
+	case KEVENT_CTL_REMOVE:
+		err = kevent_user_ctl_remove(u, num, arg);
+		break;
+	case KEVENT_CTL_MODIFY:
+		err = kevent_user_ctl_modify(u, num, arg);
+		break;
+	case KEVENT_CTL_READY:
+		err = kevent_user_ctl_ready(u, num, arg);
+		break;
+	default:
+		err = -EINVAL;
+		break;
+	}
+
+	return err;
+}
+
+/*
+ * Used to get ready kevents from queue.
+ * @ctl_fd - kevent control descriptor which must be obtained through kevent_ctl(KEVENT_CTL_INIT).
+ * @min_nr - minimum number of ready kevents.
+ * @max_nr - maximum number of ready kevents.
+ * @timeout - time to wait until some events are ready.
+ * @buf - buffer to place ready events.
+ * @flags - various flags (see include/linux/ukevent.h KEVENT_FLAGS_*).
+ */
+asmlinkage long sys_kevent_get_events(int ctl_fd, unsigned int min_nr, unsigned int max_nr,
+		struct timespec timeout, struct ukevent __user *buf, unsigned flags)
+{
+	int err = -EINVAL;
+	struct file *file;
+	struct kevent_user *u;
+
+	file = fget(ctl_fd);
+	if (!file)
+		return -EBADF;
+
+	if (file->f_op != &kevent_user_fops)
+		goto out_fput;
+	u = file->private_data;
+
+	err = kevent_user_wait(file, u, min_nr, max_nr, timeout, buf, flags);
+out_fput:
+	fput(file);
+	return err;
+}
+
+static struct vfsmount *kevent_mnt __read_mostly;
+
+static int kevent_get_sb(struct file_system_type *fs_type, int flags,
+		   const char *dev_name, void *data, struct vfsmount *mnt)
+{
+	return get_sb_pseudo(fs_type, "kevent", NULL, 0xaabbccdd, mnt);
+}
+
+static struct file_system_type kevent_fs_type = {
+	.name		= "keventfs",
+	.get_sb		= kevent_get_sb,
+	.kill_sb	= kill_anon_super,
+};
+
+static int keventfs_delete_dentry(struct dentry *dentry)
+{
+	return 1;
+}
+
+static struct dentry_operations keventfs_dentry_operations = {
+	.d_delete	= keventfs_delete_dentry,
+};
+
+asmlinkage long sys_kevent_init(struct kevent_ring __user *ring, unsigned int num, unsigned int flags)
+{
+	struct qstr this;
+	char name[32];
+	struct dentry *dentry;
+	struct inode *inode;
+	struct file *file;
+	int err = -ENFILE, fd;
+	struct kevent_user *u;
+
+	if ((ring && !num) || (!ring && num) || (num == 1))
+		return -EINVAL;
+
+	file = get_empty_filp();
+	if (!file)
+		goto err_out_exit;
+
+	inode = new_inode(kevent_mnt->mnt_sb);
+	if (!inode)
+		goto err_out_fput;
+
+	inode->i_fop = &kevent_user_fops;
+
+	inode->i_state = I_DIRTY;
+	inode->i_mode = S_IRUSR | S_IWUSR;
+	inode->i_uid = current->fsuid;
+	inode->i_gid = current->fsgid;
+	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+
+	err = get_unused_fd();
+	if (err < 0)
+		goto err_out_iput;
+	fd = err;
+
+	err = -ENOMEM;
+	u = kevent_user_alloc(ring, num);
+	if (!u)
+		goto err_out_put_fd;
+
+	sprintf(name, "[%lu]", inode->i_ino);
+	this.name = name;
+	this.len = strlen(name);
+	this.hash = inode->i_ino;
+	dentry = d_alloc(kevent_mnt->mnt_sb->s_root, &this);
+	if (!dentry)
+		goto err_out_free;
+	dentry->d_op = &keventfs_dentry_operations;
+	d_add(dentry, inode);
+	file->f_vfsmnt = mntget(kevent_mnt);
+	file->f_dentry = dentry;
+	file->f_mapping = inode->i_mapping;
+	file->f_pos = 0;
+	file->f_flags = O_RDONLY;
+	file->f_op = &kevent_user_fops;
+	file->f_mode = FMODE_READ;
+	file->f_version = 0;
+	file->private_data = u;
+
+	fd_install(fd, file);
+
+	return fd;
+
+err_out_free:
+	kmem_cache_free(kevent_user_cache, u);
+err_out_put_fd:
+	put_unused_fd(fd);
+err_out_iput:
+	iput(inode);
+err_out_fput:
+	put_filp(file);
+err_out_exit:
+	return err;
+}
+
+/*
+ * Commits user's index (consumer index).
+ * Must be called under u->ring_lock mutex held.
+ */
+static int __kevent_user_commit(struct kevent_user *u, unsigned int new_uidx, unsigned int over)
+{
+	int err = -EOVERFLOW, comm = 0;
+	struct kevent_ring __user *ring = u->pring;
+
+	if (!ring) {
+		err = 0;
+		goto err_out_exit;
+	}
+
+	if (new_uidx >= u->ring_size) {
+		err = -EINVAL;
+		goto err_out_exit;
+	}
+
+	if ((over != u->ring_over - 1) && (over != u->ring_over))
+		goto err_out_exit;
+
+	if (u->uidx < u->kidx && new_uidx > u->kidx) {
+		err = -EINVAL;
+		goto err_out_exit;
+	}
+
+	if (new_uidx > u->uidx) {
+		if (over != u->ring_over)
+			goto err_out_exit;
+
+		comm = new_uidx - u->uidx;
+		u->uidx = new_uidx;
+		u->full = 0;
+	} else if (new_uidx < u->uidx) {
+		comm = u->ring_size - (u->uidx - new_uidx);
+		u->uidx = new_uidx;
+		u->full = 0;
+		u->ring_over++;
+
+		if (put_user(u->ring_over, &ring->ring_over)) {
+			err = -EFAULT;
+			goto err_out_exit;
+		}
+	}
+
+	return comm;
+
+err_out_exit:
+	return err;
+}
+
+/*
+ * This syscall is used to perform waiting until there is free space in the ring
+ * buffer, in that case some events will be copied there.
+ * Function returns number of actually copied ready events in ring buffer.
+ * After this function is completed userspace ring->ring_kidx will be updated.
+ *
+ * @ctl_fd - kevent file descriptor.
+ * @num - number of kevents to process.
+ * @old_uidx - the last index user is aware of.
+ * @timeout - time to wait until there is free space in kevent queue.
+ * @flags - various flags (see include/linux/ukevent.h KEVENT_FLAGS_*).
+ *
+ * When we need to commit @num events, it means we should just remove first @num
+ * kevents from ready queue and copy them into the buffer. 
+ * Kevents will be copied into ring buffer in order they were placed into ready queue.
+ * One-shot kevents will be removed here, since there is no way they can be reused.
+ * Edge-triggered events will be requeued here for better performance.
+ */
+asmlinkage long sys_kevent_wait(int ctl_fd, unsigned int num, unsigned int old_uidx, 
+		struct timespec timeout, unsigned int flags)
+{
+	int err = -EINVAL, copied = 0;
+	struct file *file;
+	struct kevent_user *u;
+	struct kevent *k;
+	struct kevent_ring __user *ring;
+	long tm = MAX_SCHEDULE_TIMEOUT;
+	unsigned int i;
+
+	file = fget(ctl_fd);
+	if (!file)
+		return -EBADF;
+
+	if (file->f_op != &kevent_user_fops)
+		goto out_fput;
+	u = file->private_data;
+
+	ring = u->pring;
+	if (!ring || num > u->ring_size)
+		goto out_fput;
+#if 0
+	/*
+	 * Allow to immediately update ring index, but it is not supported,
+	 * since syscall() has limited number of arguments which is actually
+	 * a good idea - use kevent_commit() instead.
+	 */
+	if ((u->uidx != new_uidx) && (new_uidx != 0xffffffff)) {
+		mutex_lock(&u->ring_lock);
+		__kevent_user_commit(u, new_uidx, over);
+		mutex_unlock(&u->ring_lock);
+	}
+#endif
+
+	if (!(file->f_flags & O_NONBLOCK)) {
+		if (!timespec_valid(&timeout))
+			goto out_fput;
+
+		if (flags & KEVENT_FLAGS_ABSTIME) {
+			hrtimer_cancel(&u->timer);
+			hrtimer_init(&u->timer, CLOCK_REALTIME, HRTIMER_ABS);
+			u->timer.expires = ktime_set(timeout.tv_sec, timeout.tv_nsec);
+			u->timer.function = &kevent_user_wake;
+			hrtimer_start(&u->timer, u->timer.expires, HRTIMER_ABS);
+			if (unlikely(kevent_debug_abstime == 0)) {
+				printk(KERN_INFO "kevent: author was wrong, "
+						"someone uses absolute time in %s, "
+						"please report to remove this warning.\n", __func__);
+				kevent_debug_abstime = 1;
+			}
+		} else {
+			tm = timespec_to_jiffies(&timeout);
+		}
+
+		wait_event_interruptible_timeout(u->wait,
+			((u->ready_num >= 1) && kevent_ring_space(u)) || 
+				u->need_exit || old_uidx != u->uidx,
+				tm);
+	}
+	u->need_exit = 0;
+
+	for (i=0; i<num; ++i) {
+		k = kevent_dequeue_ready_ring(u);
+		if (!k)
+			break;
+		kevent_complete_ready(k);
+
+		if (k->event.ret_flags & KEVENT_RET_COPY_FAILED)
+			break;
+		kevent_stat_ring(u);
+		copied++;
+	}
+
+	fput(file);
+
+	return copied;
+out_fput:
+	fput(file);
+	return err;
+}
+
+/*
+ * This syscall is used to commit events in ring buffer, i.e. mark appropriate
+ * entries as unused by userspace so subsequent kevent_wait() could overwrite them.
+ * This fucntion returns actual number of kevents which were committed.
+ * After this function is completed userspace ring->ring_over can be updated.
+ *
+ * @ctl_fd - kevent file descriptor.
+ * @new_uidx - the last committed kevent.
+ * @over - number of overflows given queue had.
+ */
+asmlinkage long sys_kevent_commit(int ctl_fd, unsigned int new_uidx, unsigned int over)
+{
+	int err = -EINVAL, comm = 0;
+	struct file *file;
+	struct kevent_user *u;
+
+	file = fget(ctl_fd);
+	if (!file)
+		return -EBADF;
+
+	if (file->f_op != &kevent_user_fops)
+		goto out_fput;
+	u = file->private_data;
+
+	mutex_lock(&u->ring_lock);
+	err = __kevent_user_commit(u, new_uidx, over);
+	if (err < 0)
+		goto err_out_unlock;
+	comm = err;
+	mutex_unlock(&u->ring_lock);
+
+	fput(file);
+
+	return comm;
+
+err_out_unlock:
+	mutex_unlock(&u->ring_lock);
+out_fput:
+	fput(file);
+	return err;
+}
+
+/*
+ * This syscall is used to perform various control operations
+ * on given kevent queue, which is obtained through kevent file descriptor @fd.
+ * @cmd - type of operation.
+ * @num - number of kevents to be processed.
+ * @arg - pointer to array of struct ukevent.
+ */
+asmlinkage long sys_kevent_ctl(int fd, unsigned int cmd, unsigned int num, struct ukevent __user *arg)
+{
+	int err = -EINVAL;
+	struct file *file;
+
+	file = fget(fd);
+	if (!file)
+		return -EBADF;
+
+	if (file->f_op != &kevent_user_fops)
+		goto out_fput;
+
+	err = kevent_ctl_process(file, cmd, num, arg);
+
+out_fput:
+	fput(file);
+	return err;
+}
+
+/*
+ * Kevent subsystem initialization - create caches and register
+ * filesystem to get control file descriptors from.
+ */
+static int __init kevent_user_init(void)
+{
+	int err = 0;
+
+	kevent_cache = kmem_cache_create("kevent_cache",
+			sizeof(struct kevent), 0, SLAB_PANIC, NULL, NULL);
+	
+	kevent_user_cache = kmem_cache_create("kevent_user_cache",
+			sizeof(struct kevent_user), 0, SLAB_PANIC, NULL, NULL);
+	
+	err = register_filesystem(&kevent_fs_type);
+	if (err)
+		goto err_out_exit;
+	
+	kevent_mnt = kern_mount(&kevent_fs_type);
+	err = PTR_ERR(kevent_mnt);
+	if (IS_ERR(kevent_mnt))
+		goto err_out_unreg;
+
+	printk(KERN_INFO "KEVENT subsystem has been successfully registered.\n");
+
+	return 0;
+
+err_out_unreg:
+	unregister_filesystem(&kevent_fs_type);
+err_out_exit:
+	kmem_cache_destroy(kevent_cache);
+	return err;
+}
+
+module_init(kevent_user_init);
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 7a3b2e7..3b7d35f 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -122,6 +122,12 @@ cond_syscall(ppc_rtas);
 cond_syscall(sys_spu_run);
 cond_syscall(sys_spu_create);
 
+cond_syscall(sys_kevent_get_events);
+cond_syscall(sys_kevent_ctl);
+cond_syscall(sys_kevent_wait);
+cond_syscall(sys_kevent_commit);
+cond_syscall(sys_kevent_init);
+
 /* mmu depending weak syscall entries */
 cond_syscall(sys_mprotect);
 cond_syscall(sys_msync);


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [take28-resend_1->0 8/8] kevent: Kevent posix timer notifications.
  2006-12-21  9:14               ` [take28-resend_1->0 7/8] kevent: Signal notifications Evgeniy Polyakov
@ 2006-12-21  9:14                 ` Evgeniy Polyakov
  0 siblings, 0 replies; 76+ messages in thread
From: Evgeniy Polyakov @ 2006-12-21  9:14 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: David Miller, Ulrich Drepper, Andrew Morton, Evgeniy Polyakov,
	netdev, Zach Brown, Christoph Hellwig, Chase Venters,
	Johann Borck, linux-kernel, Jeff Garzik


Kevent posix timer notifications.

Simple extensions to POSIX timers which allows
to deliver notification of the timer expiration
through kevent queue.

Example application posix_timer.c can be found
in archive on project homepage.

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>


diff --git a/include/asm-generic/siginfo.h b/include/asm-generic/siginfo.h
index 8786e01..3768746 100644
--- a/include/asm-generic/siginfo.h
+++ b/include/asm-generic/siginfo.h
@@ -235,6 +235,7 @@ typedef struct siginfo {
 #define SIGEV_NONE	1	/* other notification: meaningless */
 #define SIGEV_THREAD	2	/* deliver via thread creation */
 #define SIGEV_THREAD_ID 4	/* deliver to thread */
+#define SIGEV_KEVENT	8	/* deliver through kevent queue */
 
 /*
  * This works because the alignment is ok on all current architectures
@@ -260,6 +261,8 @@ typedef struct sigevent {
 			void (*_function)(sigval_t);
 			void *_attribute;	/* really pthread_attr_t */
 		} _sigev_thread;
+
+		int kevent_fd;
 	} _sigev_un;
 } sigevent_t;
 
diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index a7dd38f..4b9deb4 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -4,6 +4,7 @@
 #include <linux/spinlock.h>
 #include <linux/list.h>
 #include <linux/sched.h>
+#include <linux/kevent_storage.h>
 
 union cpu_time_count {
 	cputime_t cpu;
@@ -49,6 +50,9 @@ struct k_itimer {
 	sigval_t it_sigev_value;	/* value word of sigevent struct */
 	struct task_struct *it_process;	/* process to send signal to */
 	struct sigqueue *sigq;		/* signal queue entry. */
+#ifdef CONFIG_KEVENT_TIMER
+	struct kevent_storage st;
+#endif
 	union {
 		struct {
 			struct hrtimer timer;
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index e5ebcc1..74270f8 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -48,6 +48,8 @@
 #include <linux/wait.h>
 #include <linux/workqueue.h>
 #include <linux/module.h>
+#include <linux/kevent.h>
+#include <linux/file.h>
 
 /*
  * Management arrays for POSIX timers.	 Timers are kept in slab memory
@@ -224,6 +226,100 @@ static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp)
 	return 0;
 }
 
+#ifdef CONFIG_KEVENT_TIMER
+static int posix_kevent_enqueue(struct kevent *k)
+{
+	/*
+	 * It is not ugly - there is no pointer in the id field union, 
+	 * but its size is 64bits, which is ok for any known pointer size.
+	 */
+	struct k_itimer *tmr = (struct k_itimer *)(unsigned long)k->event.id.raw_u64;
+	return kevent_storage_enqueue(&tmr->st, k);
+}
+static int posix_kevent_dequeue(struct kevent *k)
+{
+	struct k_itimer *tmr = (struct k_itimer *)(unsigned long)k->event.id.raw_u64;
+	kevent_storage_dequeue(&tmr->st, k);
+	return 0;
+}
+static int posix_kevent_callback(struct kevent *k)
+{
+	return 1;
+}
+static int posix_kevent_init(void)
+{
+	struct kevent_callbacks tc = {
+		.callback = &posix_kevent_callback,
+		.enqueue = &posix_kevent_enqueue,
+		.dequeue = &posix_kevent_dequeue,
+		.flags = KEVENT_CALLBACKS_KERNELONLY};
+
+	return kevent_add_callbacks(&tc, KEVENT_POSIX_TIMER);
+}
+
+extern struct file_operations kevent_user_fops;
+
+static int posix_kevent_init_timer(struct k_itimer *tmr, int fd)
+{
+	struct ukevent uk;
+	struct file *file;
+	struct kevent_user *u;
+	int err;
+
+	file = fget(fd);
+	if (!file) {
+		err = -EBADF;
+		goto err_out;
+	}
+
+	if (file->f_op != &kevent_user_fops) {
+		err = -EINVAL;
+		goto err_out_fput;
+	}
+
+	u = file->private_data;
+
+	memset(&uk, 0, sizeof(struct ukevent));
+
+	uk.event = KEVENT_MASK_ALL;
+	uk.type = KEVENT_POSIX_TIMER;
+	uk.id.raw_u64 = (unsigned long)(tmr); /* Just cast to something unique */
+	uk.req_flags = KEVENT_REQ_ONESHOT | KEVENT_REQ_ALWAYS_QUEUE;
+	uk.ptr = tmr->it_sigev_value.sival_ptr;
+
+	err = kevent_user_add_ukevent(&uk, u);
+	if (err)
+		goto err_out_fput;
+
+	fput(file);
+
+	return 0;
+
+err_out_fput:
+	fput(file);
+err_out:
+	return err;
+}
+
+static void posix_kevent_fini_timer(struct k_itimer *tmr)
+{
+	kevent_storage_fini(&tmr->st);
+}
+#else
+static int posix_kevent_init_timer(struct k_itimer *tmr, int fd)
+{
+	return -ENOSYS;
+}
+static int posix_kevent_init(void)
+{
+	return 0;
+}
+static void posix_kevent_fini_timer(struct k_itimer *tmr)
+{
+}
+#endif
+
+
 /*
  * Initialize everything, well, just everything in Posix clocks/timers ;)
  */
@@ -241,6 +337,11 @@ static __init int init_posix_timers(void)
 	register_posix_clock(CLOCK_REALTIME, &clock_realtime);
 	register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic);
 
+	if (posix_kevent_init()) {
+		printk(KERN_ERR "Failed to initialize kevent posix timers.\n");
+		BUG();
+	}
+
 	posix_timers_cache = kmem_cache_create("posix_timers_cache",
 					sizeof (struct k_itimer), 0, 0, NULL, NULL);
 	idr_init(&posix_timers_id);
@@ -343,23 +444,29 @@ static int posix_timer_fn(struct hrtimer *timer)
 
 	timr = container_of(timer, struct k_itimer, it.real.timer);
 	spin_lock_irqsave(&timr->it_lock, flags);
+	
+	if (timr->it_sigev_notify == SIGEV_KEVENT) {
+#ifdef CONFIG_KEVENT_TIMER
+		kevent_storage_ready(&timr->st, NULL, KEVENT_MASK_ALL);
+#endif
+	} else {
+		if (timr->it.real.interval.tv64 != 0)
+			si_private = ++timr->it_requeue_pending;
 
-	if (timr->it.real.interval.tv64 != 0)
-		si_private = ++timr->it_requeue_pending;
-
-	if (posix_timer_event(timr, si_private)) {
-		/*
-		 * signal was not sent because of sig_ignor
-		 * we will not get a call back to restart it AND
-		 * it should be restarted.
-		 */
-		if (timr->it.real.interval.tv64 != 0) {
-			timr->it_overrun +=
-				hrtimer_forward(timer,
-						timer->base->softirq_time,
-						timr->it.real.interval);
-			ret = HRTIMER_RESTART;
-			++timr->it_requeue_pending;
+		if (posix_timer_event(timr, si_private)) {
+			/*
+			 * signal was not sent because of sig_ignor
+			 * we will not get a call back to restart it AND
+			 * it should be restarted.
+			 */
+			if (timr->it.real.interval.tv64 != 0) {
+				timr->it_overrun +=
+					hrtimer_forward(timer,
+							timer->base->softirq_time,
+							timr->it.real.interval);
+				ret = HRTIMER_RESTART;
+				++timr->it_requeue_pending;
+			}
 		}
 	}
 
@@ -407,6 +514,9 @@ static struct k_itimer * alloc_posix_timer(void)
 		kmem_cache_free(posix_timers_cache, tmr);
 		tmr = NULL;
 	}
+#ifdef CONFIG_KEVENT_TIMER
+	kevent_storage_init(tmr, &tmr->st);
+#endif
 	return tmr;
 }
 
@@ -424,6 +534,7 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
 	if (unlikely(tmr->it_process) &&
 	    tmr->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
 		put_task_struct(tmr->it_process);
+	posix_kevent_fini_timer(tmr);
 	kmem_cache_free(posix_timers_cache, tmr);
 }
 
@@ -496,40 +607,52 @@ sys_timer_create(const clockid_t which_clock,
 		new_timer->it_sigev_signo = event.sigev_signo;
 		new_timer->it_sigev_value = event.sigev_value;
 
-		read_lock(&tasklist_lock);
-		if ((process = good_sigevent(&event))) {
-			/*
-			 * We may be setting up this process for another
-			 * thread.  It may be exiting.  To catch this
-			 * case the we check the PF_EXITING flag.  If
-			 * the flag is not set, the siglock will catch
-			 * him before it is too late (in exit_itimers).
-			 *
-			 * The exec case is a bit more invloved but easy
-			 * to code.  If the process is in our thread
-			 * group (and it must be or we would not allow
-			 * it here) and is doing an exec, it will cause
-			 * us to be killed.  In this case it will wait
-			 * for us to die which means we can finish this
-			 * linkage with our last gasp. I.e. no code :)
-			 */
+		if (event.sigev_notify == SIGEV_KEVENT) {
+			error = posix_kevent_init_timer(new_timer, event._sigev_un.kevent_fd);
+			if (error)
+				goto out;
+
+			process = current->group_leader;
 			spin_lock_irqsave(&process->sighand->siglock, flags);
-			if (!(process->flags & PF_EXITING)) {
-				new_timer->it_process = process;
-				list_add(&new_timer->list,
-					 &process->signal->posix_timers);
-				spin_unlock_irqrestore(&process->sighand->siglock, flags);
-				if (new_timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
-					get_task_struct(process);
-			} else {
-				spin_unlock_irqrestore(&process->sighand->siglock, flags);
-				process = NULL;
+			new_timer->it_process = process;
+			list_add(&new_timer->list, &process->signal->posix_timers);
+			spin_unlock_irqrestore(&process->sighand->siglock, flags);
+		} else {
+			read_lock(&tasklist_lock);
+			if ((process = good_sigevent(&event))) {
+				/*
+				 * We may be setting up this process for another
+				 * thread.  It may be exiting.  To catch this
+				 * case the we check the PF_EXITING flag.  If
+				 * the flag is not set, the siglock will catch
+				 * him before it is too late (in exit_itimers).
+				 *
+				 * The exec case is a bit more invloved but easy
+				 * to code.  If the process is in our thread
+				 * group (and it must be or we would not allow
+				 * it here) and is doing an exec, it will cause
+				 * us to be killed.  In this case it will wait
+				 * for us to die which means we can finish this
+				 * linkage with our last gasp. I.e. no code :)
+				 */
+				spin_lock_irqsave(&process->sighand->siglock, flags);
+				if (!(process->flags & PF_EXITING)) {
+					new_timer->it_process = process;
+					list_add(&new_timer->list,
+						 &process->signal->posix_timers);
+					spin_unlock_irqrestore(&process->sighand->siglock, flags);
+					if (new_timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
+						get_task_struct(process);
+				} else {
+					spin_unlock_irqrestore(&process->sighand->siglock, flags);
+					process = NULL;
+				}
+			}
+			read_unlock(&tasklist_lock);
+			if (!process) {
+				error = -EINVAL;
+				goto out;
 			}
-		}
-		read_unlock(&tasklist_lock);
-		if (!process) {
-			error = -EINVAL;
-			goto out;
 		}
 	} else {
 		new_timer->it_sigev_notify = SIGEV_SIGNAL;


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [take28-resend_1->0 0/8] kevent: Generic event handling mechanism.
       [not found] <3154985aa0591036@2ka.mipt.ru>
  2006-12-17 13:53 ` [take28-resend_2->0 0/8] kevent: Generic event handling mechanism Evgeniy Polyakov
@ 2006-12-21  9:14 ` Evgeniy Polyakov
  2006-12-21  9:14   ` [take28-resend_1->0 1/8] kevent: Description Evgeniy Polyakov
  2006-12-21 10:35   ` [take28-resend_1->0 0/8] kevent: Generic event handling mechanism Evgeniy Polyakov
  2006-12-23 16:51 ` [take29 " Evgeniy Polyakov
                   ` (2 subsequent siblings)
  4 siblings, 2 replies; 76+ messages in thread
From: Evgeniy Polyakov @ 2006-12-21  9:14 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: David Miller, Ulrich Drepper, Andrew Morton, Evgeniy Polyakov,
	netdev, Zach Brown, Christoph Hellwig, Chase Venters,
	Johann Borck, linux-kernel, Jeff Garzik


Generic event handling mechanism.

Kevent is a generic subsytem which allows to handle event notifications.
It supports both level and edge triggered events. It is similar to
poll/epoll in some cases, but it is more scalable, it is faster and
allows to work with essentially eny kind of events.

Events are provided into kernel through control syscall and can be read
back through ring buffer or using usual syscalls.
Kevent update (i.e. readiness switching) happens directly from internals
of the appropriate state machine of the underlying subsytem (like
network, filesystem, timer or any other).

Homepage:
http://tservice.net.ru/~s0mbre/old/?section=projects&item=kevent

Documentation page:
http://linux-net.osdl.org/index.php/Kevent

Consider for inclusion.

New benchmark, which can be a hoax though, can be found at 
http://tservice.net.ru/~s0mbre/blog/2006/11/30#2006_11_30
where kevent on amd64 with 1gb of ram can handle more than 7200 events per 
second with 8000 requests concurrency with 'ab' benchmark and lighttpd.
Although I tought it should not be published due to possible errors,
I decided to send it for review.

With this release I start 3 days resending timeout - i.e. each third day I 
will send either new version (if something new was requested and agreed to 
be implemented) or resending with back counter started from three. 
When back counter hits zero after three resending I consider there is no 
interest in subsystem and I will stop further sending.

Thanks for understanding and your time.

Changes from 'take27' patchset:
 * made kevent default yes in non embedded case.
 * added falgs to callback structures - currently used to check if kevent
 	can be requested from kernelspace only (posix timers) or 
	userspace (all others)

Changes from 'take26' patchset:
 * made kevent visible in config only in case of embedded setup.
 * added comment about KEVENT_MAX number.
 * spell fix.

Changes from 'take25' patchset:
 * use timespec as timeout parameter.
 * added high-resolution timer to handle absolute timeouts.
 * added flags to waiting and initialization syscalls.
 * kevent_commit() has new_uidx parameter.
 * kevent_wait() has old_uidx parameter, which, if not equal to u->uidx,
 	results in immediate wakeup (usefull for the case when entries
	are added asynchronously from kernel (not supported for now)).
 * added interface to mark any event as ready.
 * event POSIX timers support.
 * return -ENOSYS if there is no registered event type.
 * provided file descriptor must be checked for fifo type (spotted by Eric Dumazet).
 * signal notifications.
 * documentation update.
 * lighttpd patch updated (the latest benchmarks with lighttpd patch can be found in blog).

Changes from 'take24' patchset:
 * new (old (new)) ring buffer implementation with kernel and user indexes.
 * added initialization syscall instead of opening /dev/kevent
 * kevent_commit() syscall to commit ring buffer entries
 * changed KEVENT_REQ_WAKEUP_ONE flag to KEVENT_REQ_WAKEUP_ALL, kevent wakes
   only first thread always if that flag is not set
 * KEVENT_REQ_ALWAYS_QUEUE flag. If set, kevent will be queued into ready queue
   instead of copying back to userspace when kevent is ready immediately when
   it is added.
 * lighttpd patch (Hail! Although nothing really outstanding compared to epoll)

Changes from 'take23' patchset:
 * kevent PIPE notifications
 * KEVENT_REQ_LAST_CHECK flag, which allows to perform last check at dequeueing time
 * fixed poll/select notifications (were broken due to tree manipulations)
 * made Documentation/kevent.txt look nice in 80-col terminal
 * fix for copy_to_user() failure report for the first kevent (Andrew Morton)
 * minor function renames

Changes from 'take22' patchset:
 * new ring buffer implementation in process' memory
 * wakeup-one-thread flag
 * edge-triggered behaviour

Changes from 'take21' patchset:
 * minor cleanups (different return values, removed unneded variables, whitespaces and so on)
 * fixed bug in kevent removal in case when kevent being removed
   is the same as overflow_kevent (spotted by Eric Dumazet)

Changes from 'take20' patchset:
 * new ring buffer implementation
 * removed artificial limit on possible number of kevents

Changes from 'take19' patchset:
 * use __init instead of __devinit
 * removed 'default N' from config for user statistic
 * removed kevent_user_fini() since kevent can not be unloaded
 * use KERN_INFO for statistic output

Changes from 'take18' patchset:
 * use __init instead of __devinit
 * removed 'default N' from config for user statistic
 * removed kevent_user_fini() since kevent can not be unloaded
 * use KERN_INFO for statistic output

Changes from 'take17' patchset:
 * Use RB tree instead of hash table. 
	At least for a web sever, frequency of addition/deletion of new kevent 
	is comparable with number of search access, i.e. most of the time events 
	are added, accesed only couple of times and then removed, so it justifies 
	RB tree usage over AVL tree, since the latter does have much slower deletion 
	time (max O(log(N)) compared to 3 ops), 
	although faster search time (1.44*O(log(N)) vs. 2*O(log(N))). 
	So for kevents I use RB tree for now and later, when my AVL tree implementation 
	is ready, it will be possible to compare them.
 * Changed readiness check for socket notifications.

With both above changes it is possible to achieve more than 3380 req/second compared to 2200, 
sometimes 2500 req/second for epoll() for trivial web-server and httperf client on the same
hardware.
It is possible that above kevent limit is due to maximum allowed kevents in a time limit, which is
4096 events.

Changes from 'take16' patchset:
 * misc cleanups (__read_mostly, const ...)
 * created special macro which is used for mmap size (number of pages) calculation
 * export kevent_socket_notify(), since it is used in network protocols which can be 
	built as modules (IPv6 for example)

Changes from 'take15' patchset:
 * converted kevent_timer to high-resolution timers, this forces timer API update at
	http://linux-net.osdl.org/index.php/Kevent
 * use struct ukevent* instead of void * in syscalls (documentation has been updated)
 * added warning in kevent_add_ukevent() if ring has broken index (for testing)

Changes from 'take14' patchset:
 * added kevent_wait()
    This syscall waits until either timeout expires or at least one event
    becomes ready. It also commits that @num events from @start are processed
    by userspace and thus can be be removed or rearmed (depending on it's flags).
    It can be used for commit events read by userspace through mmap interface.
    Example userspace code (evtest.c) can be found on project's homepage.
 * added socket notifications (send/recv/accept)

Changes from 'take13' patchset:
 * do not get lock aroung user data check in __kevent_search()
 * fail early if there were no registered callbacks for given type of kevent
 * trailing whitespace cleanup

Changes from 'take12' patchset:
 * remove non-chardev interface for initialization
 * use pointer to kevent_mring instead of unsigned longs
 * use aligned 64bit type in raw user data (can be used by high-res timer if needed)
 * simplified enqueue/dequeue callbacks and kevent initialization
 * use nanoseconds for timeout
 * put number of milliseconds into timer's return data
 * move some definitions into user-visible header
 * removed filenames from comments

Changes from 'take11' patchset:
 * include missing headers into patchset
 * some trivial code cleanups (use goto instead of if/else games and so on)
 * some whitespace cleanups
 * check for ready_callback() callback before main loop which should save us some ticks

Changes from 'take10' patchset:
 * removed non-existent prototypes
 * added helper function for kevent_registered_callbacks
 * fixed 80 lines comments issues
 * added shared between userspace and kernelspace header instead of embedd them in one
 * core restructuring to remove forward declarations
 * s o m e w h i t e s p a c e c o d y n g s t y l e c l e a n u p
 * use vm_insert_page() instead of remap_pfn_range()

Changes from 'take9' patchset:
 * fixed ->nopage method

Changes from 'take8' patchset:
 * fixed mmap release bug
 * use module_init() instead of late_initcall()
 * use better structures for timer notifications

Changes from 'take7' patchset:
 * new mmap interface (not tested, waiting for other changes to be acked)
	- use nopage() method to dynamically substitue pages
	- allocate new page for events only when new added kevent requres it
	- do not use ugly index dereferencing, use structure instead
	- reduced amount of data in the ring (id and flags), 
		maximum 12 pages on x86 per kevent fd

Changes from 'take6' patchset:
 * a lot of comments!
 * do not use list poisoning for detection of the fact, that entry is in the list
 * return number of ready kevents even if copy*user() fails
 * strict check for number of kevents in syscall
 * use ARRAY_SIZE for array size calculation
 * changed superblock magic number
 * use SLAB_PANIC instead of direct panic() call
 * changed -E* return values
 * a lot of small cleanups and indent fixes

Changes from 'take5' patchset:
 * removed compilation warnings about unused wariables when lockdep is not turned on
 * do not use internal socket structures, use appropriate (exported) wrappers instead
 * removed default 1 second timeout
 * removed AIO stuff from patchset

Changes from 'take4' patchset:
 * use miscdevice instead of chardevice
 * comments fixes

Changes from 'take3' patchset:
 * removed serializing mutex from kevent_user_wait()
 * moved storage list processing to RCU
 * removed lockdep screaming - all storage locks are initialized in the same function, so it was
learned 
	to differentiate between various cases
 * remove kevent from storage if is marked as broken after callback
 * fixed a typo in mmaped buffer implementation which would end up in wrong index calcualtion 

Changes from 'take2' patchset:
 * split kevent_finish_user() to locked and unlocked variants
 * do not use KEVENT_STAT ifdefs, use inline functions instead
 * use array of callbacks of each type instead of each kevent callback initialization
 * changed name of ukevent guarding lock
 * use only one kevent lock in kevent_user for all hash buckets instead of per-bucket locks
 * do not use kevent_user_ctl structure instead provide needed arguments as syscall parameters
 * various indent cleanups
 * added optimisation, which is aimed to help when a lot of kevents are being copied from
userspace
 * mapped buffer (initial) implementation (no userspace yet)

Changes from 'take1' patchset:
 - rebased against 2.6.18-git tree
 - removed ioctl controlling
 - added new syscall kevent_get_events(int fd, unsigned int min_nr, unsigned int max_nr,
			unsigned int timeout, void __user *buf, unsigned flags)
 - use old syscall kevent_ctl for creation/removing, modification and initial kevent 
	initialization
 - use mutuxes instead of semaphores
 - added file descriptor check and return error if provided descriptor does not match
	kevent file operations
 - various indent fixes
 - removed aio_sendfile() declarations.

Thank you.

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>



^ permalink raw reply	[flat|nested] 76+ messages in thread

* [take28-resend_1->0 5/8] kevent: Timer notifications.
  2006-12-21  9:14         ` [take28-resend_1->0 4/8] kevent: Socket notifications Evgeniy Polyakov
@ 2006-12-21  9:14           ` Evgeniy Polyakov
  2006-12-21  9:14             ` [take28-resend_1->0 6/8] kevent: Pipe notifications Evgeniy Polyakov
  0 siblings, 1 reply; 76+ messages in thread
From: Evgeniy Polyakov @ 2006-12-21  9:14 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: David Miller, Ulrich Drepper, Andrew Morton, Evgeniy Polyakov,
	netdev, Zach Brown, Christoph Hellwig, Chase Venters,
	Johann Borck, linux-kernel, Jeff Garzik


Timer notifications.

Timer notifications can be used for fine grained per-process time 
management, since interval timers are very inconvenient to use, 
and they are limited.

This subsystem uses high-resolution timers.
id.raw[0] is used as number of seconds
id.raw[1] is used as number of nanoseconds

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>

diff --git a/kernel/kevent/kevent_timer.c b/kernel/kevent/kevent_timer.c
new file mode 100644
index 0000000..c21a155
--- /dev/null
+++ b/kernel/kevent/kevent_timer.c
@@ -0,0 +1,114 @@
+/*
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/hrtimer.h>
+#include <linux/jiffies.h>
+#include <linux/kevent.h>
+
+struct kevent_timer
+{
+	struct hrtimer		ktimer;
+	struct kevent_storage	ktimer_storage;
+	struct kevent		*ktimer_event;
+};
+
+static int kevent_timer_func(struct hrtimer *timer)
+{
+	struct kevent_timer *t = container_of(timer, struct kevent_timer, ktimer);
+	struct kevent *k = t->ktimer_event;
+
+	kevent_storage_ready(&t->ktimer_storage, NULL, KEVENT_MASK_ALL);
+	hrtimer_forward(timer, timer->base->softirq_time,
+			ktime_set(k->event.id.raw[0], k->event.id.raw[1]));
+	return HRTIMER_RESTART;
+}
+
+static struct lock_class_key kevent_timer_key;
+
+static int kevent_timer_enqueue(struct kevent *k)
+{
+	int err;
+	struct kevent_timer *t;
+
+	t = kmalloc(sizeof(struct kevent_timer), GFP_KERNEL);
+	if (!t)
+		return -ENOMEM;
+
+	hrtimer_init(&t->ktimer, CLOCK_MONOTONIC, HRTIMER_REL);
+	t->ktimer.expires = ktime_set(k->event.id.raw[0], k->event.id.raw[1]);
+	t->ktimer.function = kevent_timer_func;
+	t->ktimer_event = k;
+
+	err = kevent_storage_init(&t->ktimer, &t->ktimer_storage);
+	if (err)
+		goto err_out_free;
+	lockdep_set_class(&t->ktimer_storage.lock, &kevent_timer_key);
+
+	err = kevent_storage_enqueue(&t->ktimer_storage, k);
+	if (err)
+		goto err_out_st_fini;
+
+	hrtimer_start(&t->ktimer, t->ktimer.expires, HRTIMER_REL);
+
+	return 0;
+
+err_out_st_fini:
+	kevent_storage_fini(&t->ktimer_storage);
+err_out_free:
+	kfree(t);
+
+	return err;
+}
+
+static int kevent_timer_dequeue(struct kevent *k)
+{
+	struct kevent_storage *st = k->st;
+	struct kevent_timer *t = container_of(st, struct kevent_timer, ktimer_storage);
+
+	hrtimer_cancel(&t->ktimer);
+	kevent_storage_dequeue(st, k);
+	kfree(t);
+
+	return 0;
+}
+
+static int kevent_timer_callback(struct kevent *k)
+{
+	k->event.ret_data[0] = jiffies_to_msecs(jiffies);
+	return 1;
+}
+
+static int __init kevent_init_timer(void)
+{
+	struct kevent_callbacks tc = {
+		.callback = &kevent_timer_callback,
+		.enqueue = &kevent_timer_enqueue,
+		.dequeue = &kevent_timer_dequeue,
+		.flags = 0,
+	};
+
+	return kevent_add_callbacks(&tc, KEVENT_TIMER);
+}
+module_init(kevent_init_timer);
+


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [take28-resend_1->0 1/8] kevent: Description.
  2006-12-21  9:14 ` [take28-resend_1->0 " Evgeniy Polyakov
@ 2006-12-21  9:14   ` Evgeniy Polyakov
  2006-12-21  9:14     ` [take28-resend_1->0 2/8] kevent: Core files Evgeniy Polyakov
  2006-12-21 10:35   ` [take28-resend_1->0 0/8] kevent: Generic event handling mechanism Evgeniy Polyakov
  1 sibling, 1 reply; 76+ messages in thread
From: Evgeniy Polyakov @ 2006-12-21  9:14 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: David Miller, Ulrich Drepper, Andrew Morton, Evgeniy Polyakov,
	netdev, Zach Brown, Christoph Hellwig, Chase Venters,
	Johann Borck, linux-kernel, Jeff Garzik


Description.


diff --git a/Documentation/kevent.txt b/Documentation/kevent.txt
new file mode 100644
index 0000000..2e03a3f
--- /dev/null
+++ b/Documentation/kevent.txt
@@ -0,0 +1,240 @@
+Description.
+
+int kevent_init(struct kevent_ring *ring, unsigned int ring_size, 
+	unsigned int flags);
+
+num - size of the ring buffer in events 
+ring - pointer to allocated ring buffer
+flags - various flags, see KEVENT_FLAGS_* definitions.
+
+Return value: kevent control file descriptor or negative error value.
+
+ struct kevent_ring
+ {
+   unsigned int ring_kidx, ring_over;
+   struct ukevent event[0];
+ }
+
+ring_kidx - index in the ring buffer where kernel will put new events 
+		when kevent_wait() or kevent_get_events() is called 
+ring_over - number of overflows of ring_uidx happend from the start.
+	Overflow counter is used to prevent situation when two threads 
+	are going to free the same events, but one of them was scheduled 
+	away for too long, so ring indexes were wrapped, so when that 
+	thread will be awakened, it will free not those events, which 
+	it suppose to free.
+
+Example userspace code (ring_buffer.c) can be found on project's homepage.
+
+Each kevent syscall can be so called cancellation point in glibc, i.e. when 
+thread has been cancelled in kevent syscall, thread can be safely removed 
+and no events will be lost, since each syscall (kevent_wait() or 
+kevent_get_events()) will copy event into special ring buffer, accessible 
+from other threads or even processes (if shared memory is used).
+
+When kevent is removed (not dequeued when it is ready, but just removed), 
+even if it was ready, it is not copied into ring buffer, since if it is 
+removed, no one cares about it (otherwise user would wait until it becomes 
+ready and got it through usual way using kevent_get_events() or kevent_wait()) 
+and thus no need to copy it to the ring buffer.
+
+-------------------------------------------------------------------------------
+
+
+int kevent_ctl(int fd, unsigned int cmd, unsigned int num, struct ukevent *arg);
+
+fd - is the file descriptor referring to the kevent queue to manipulate. 
+It is created by opening "/dev/kevent" char device, which is created with 
+dynamic minor number and major number assigned for misc devices. 
+
+cmd - is the requested operation. It can be one of the following:
+    KEVENT_CTL_ADD - add event notification 
+    KEVENT_CTL_REMOVE - remove event notification 
+    KEVENT_CTL_MODIFY - modify existing notification 
+    KEVENT_CTL_READY - mark existing events as ready, if number of events is zero,
+    	it just wakes up parked in syscall thread
+
+num - number of struct ukevent in the array pointed to by arg 
+arg - array of struct ukevent
+
+Return value: 
+ number of events processed or negative error value.
+
+When called, kevent_ctl will carry out the operation specified in the 
+cmd parameter.
+-------------------------------------------------------------------------------
+
+ int kevent_get_events(int ctl_fd, unsigned int min_nr, unsigned int max_nr, 
+ 		struct timespec timeout, struct ukevent *buf, unsigned flags);
+
+ctl_fd - file descriptor referring to the kevent queue 
+min_nr - minimum number of completed events that kevent_get_events will block 
+	 waiting for 
+max_nr - number of struct ukevent in buf 
+timeout - time to wait before returning less than min_nr 
+	  events. If this is -1, then wait forever. 
+buf - pointer to an array of struct ukevent. 
+flags - various flags, see KEVENT_FLAGS_* definitions.
+
+Return value:
+ number of events copied or negative error value.
+
+kevent_get_events will wait timeout milliseconds for at least min_nr completed 
+events, copying completed struct ukevents to buf and deleting any 
+KEVENT_REQ_ONESHOT event requests. In nonblocking mode it returns as many 
+events as possible, but not more than max_nr. In blocking mode it waits until 
+timeout or if at least min_nr events are ready.
+
+This function copies event into ring buffer if it was initialized, if ring buffer
+is full, KEVENT_RET_COPY_FAILED flag is set in ret_flags field.
+-------------------------------------------------------------------------------
+
+ int kevent_wait(int ctl_fd, unsigned int num, unsigned int old_uidx, 
+ 	struct timespec timeout, unsigned int flags);
+
+ctl_fd - file descriptor referring to the kevent queue 
+num - number of processed kevents 
+old_uidx - the last index user is aware of
+timeout - time to wait until there is free space in kevent queue
+flags - various flags, see KEVENT_FLAGS_* definitions.
+
+Return value:
+ number of events copied into ring buffer or negative error value.
+
+This syscall waits until either timeout expires or at least one event becomes 
+ready. It also copies events into special ring buffer. If ring buffer is full,
+it waits until there are ready events and then return.
+If kevent is one-shot kevent it is removed in this syscall.
+If kevent is edge-triggered (KEVENT_REQ_ET flag is set in 'req_flags') it is 
+requeued in this syscall for performance reasons.
+-------------------------------------------------------------------------------
+
+ int kevent_commit(int ctl_fd, unsigned int new_idx, unsigned int over);
+
+ctl_fd - file descriptor referring to the kevent queue 
+new_uidx - the last committed kevent
+over - overflow count for given $new_idx value
+
+Return value:
+ number of committed kevents or negative error value.
+
+This function commits, i.e. marks as empty, slots in the ring buffer, so
+they can be reused when userspace completes that entries processing.
+
+Overflow counter is used to prevent situation when two threads are going 
+to free the same events, but one of them was scheduled away for too long, 
+so ring indexes were wrapped, so when that thread will be awakened, it 
+will free not those events, which it suppose to free.
+
+It is possible that returned number of committed events will be smaller than
+requested number - it is possible when several threads try to commit the
+same events.
+-------------------------------------------------------------------------------
+
+The bulk of the interface is entirely done through the ukevent struct. 
+It is used to add event requests, modify existing event requests, 
+specify which event requests to remove, and return completed events.
+
+struct ukevent contains the following members:
+
+struct kevent_id id
+    Id of this request, e.g. socket number, file descriptor and so on 
+__u32 type
+    Event type, e.g. KEVENT_SOCK, KEVENT_INODE, KEVENT_TIMER and so on 
+__u32 event
+    Event itself, e.g. SOCK_ACCEPT, INODE_CREATED, TIMER_FIRED 
+__u32 req_flags
+    Per-event request flags,
+
+    KEVENT_REQ_ONESHOT
+        event will be removed when it is ready 
+
+    KEVENT_REQ_WAKEUP_ALL
+        Kevent wakes up only first thread interested in given event, 
+	or all threads if this flag is set.
+
+    KEVENT_REQ_ET
+        Edge Triggered behaviour. It is an optimisation which allows to move 
+	ready and dequeued (i.e. copied to userspace) event to move into set 
+	of interest for given storage (socket, inode and so on) again. It is 
+	very usefull for cases when the same event should be used many times 
+	(like reading from pipe). It is similar to epoll()'s EPOLLET flag. 
+
+    KEVENT_REQ_LAST_CHECK
+        if set allows to perform the last check on kevent (call appropriate 
+	callback) when kevent is marked as ready and has been removed from 
+	ready queue. If it will be confirmed that kevent is ready 
+	(k->callbacks.callback(k) returns true) then kevent will be copied 
+	to userspace, otherwise it will be requeued back to storage. 
+	Second (checking) call is performed with this bit cleared, so callback 
+	can detect when it was called from kevent_storage_ready() - bit is set, 
+	or kevent_dequeue_ready() - bit is cleared. If kevent will be requeued, 
+	bit will be set again.
+
+   KEVENT_REQ_ALWAYS_QUEUE
+        If this flag is set kevent will be queued into ready queue if it is 
+	ready at enqueue time, otherwise it will be copied back to userspace
+	and will not be queued into the storage.
+
+__u32 ret_flags
+    Per-event return flags
+
+    KEVENT_RET_BROKEN
+        Kevent is broken 
+
+    KEVENT_RET_DONE
+        Kevent processing was finished successfully 
+
+    KEVENT_RET_COPY_FAILED
+        Kevent was not copied into ring buffer due to some error conditions. 
+
+__u32 ret_data
+    Event return data. Event originator fills it with anything it likes 
+    (for example timer notifications put number of milliseconds when timer 
+    has fired 
+union { __u32 user[2]; void *ptr; }
+    User's data. It is not used, just copied to/from user. The whole structure 
+    is aligned to 8 bytes already, so the last union is aligned properly. 
+
+-------------------------------------------------------------------------------
+
+Kevent waiting syscall flags.
+
+KEVENT_FLAGS_ABSTIME - provided timespec parameter contains absolute time, 
+	for example Aug 27, 2194, or time(NULL) + 10.
+
+-------------------------------------------------------------------------------
+
+Usage
+
+For KEVENT_CTL_ADD, all fields relevant to the event type must be filled 
+(id, type, event, req_flags). 
+After kevent_ctl(..., KEVENT_CTL_ADD, ...) returns each struct's ret_flags 
+should be checked to see if the event is already broken or done.
+
+For KEVENT_CTL_MODIFY, the id, req_flags, and user and event fields must be 
+set and an existing kevent request must have matching id and user fields. If 
+match is found, req_flags and event are replaced with the newly supplied 
+values and requeueing is started, so modified kevent can be checked and 
+probably marked as ready immediately. If a match can't be found, the 
+passed in ukevent's ret_flags has KEVENT_RET_BROKEN set. KEVENT_RET_DONE is 
+always set.
+
+For KEVENT_CTL_REMOVE, the id and user fields must be set and an existing 
+kevent request must have matching id and user fields. If a match is found, 
+the kevent request is removed. If a match can't be found, the passed in 
+ukevent's ret_flags has KEVENT_RET_BROKEN set. KEVENT_RET_DONE is always set.
+
+For kevent_get_events, the entire structure is returned.
+
+-------------------------------------------------------------------------------
+
+Usage cases
+
+kevent_timer
+struct ukevent should contain following fields:
+    type - KEVENT_TIMER 
+    event - KEVENT_TIMER_FIRED 
+    req_flags - KEVENT_REQ_ONESHOT if you want to fire that timer only once 
+    id.raw[0] - number of seconds after commit when this timer shout expire 
+    id.raw[0] - additional to number of seconds number of nanoseconds 


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [take28-resend_1->0 7/8] kevent: Signal notifications.
  2006-12-21  9:14             ` [take28-resend_1->0 6/8] kevent: Pipe notifications Evgeniy Polyakov
@ 2006-12-21  9:14               ` Evgeniy Polyakov
  2006-12-21  9:14                 ` [take28-resend_1->0 8/8] kevent: Kevent posix timer notifications Evgeniy Polyakov
  0 siblings, 1 reply; 76+ messages in thread
From: Evgeniy Polyakov @ 2006-12-21  9:14 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: David Miller, Ulrich Drepper, Andrew Morton, Evgeniy Polyakov,
	netdev, Zach Brown, Christoph Hellwig, Chase Venters,
	Johann Borck, linux-kernel, Jeff Garzik


Signal notifications.

This type of notifications allows to deliver signals through kevent queue.
One can find example application signal.c on project homepage.

If KEVENT_SIGNAL_NOMASK bit is set in raw_u64 id then signal will be
delivered only through queue, otherwise both delivery types are used - old
through update of mask of pending signals and through queue.

If signal is delivered only through kevent queue mask of pending signals
is not updated at all, which is equal to putting signal into blocked mask,
but with delivery of that signal through kevent queue.

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>


diff --git a/include/linux/sched.h b/include/linux/sched.h
index fc4a987..ef38a3c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -80,6 +80,7 @@ struct sched_param {
 #include <linux/resource.h>
 #include <linux/timer.h>
 #include <linux/hrtimer.h>
+#include <linux/kevent_storage.h>
 
 #include <asm/processor.h>
 
@@ -1013,6 +1014,10 @@ struct task_struct {
 #ifdef	CONFIG_TASK_DELAY_ACCT
 	struct task_delay_info *delays;
 #endif
+#ifdef CONFIG_KEVENT_SIGNAL
+	struct kevent_storage st;
+	u32 kevent_signals;
+#endif
 };
 
 static inline pid_t process_group(struct task_struct *tsk)
diff --git a/kernel/fork.c b/kernel/fork.c
index 1c999f3..e5b5b14 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -46,6 +46,7 @@
 #include <linux/delayacct.h>
 #include <linux/taskstats_kern.h>
 #include <linux/random.h>
+#include <linux/kevent.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -115,6 +116,9 @@ void __put_task_struct(struct task_struct *tsk)
 	WARN_ON(atomic_read(&tsk->usage));
 	WARN_ON(tsk == current);
 
+#ifdef CONFIG_KEVENT_SIGNAL
+	kevent_storage_fini(&tsk->st);
+#endif
 	security_task_free(tsk);
 	free_uid(tsk->user);
 	put_group_info(tsk->group_info);
@@ -1121,6 +1125,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	if (retval)
 		goto bad_fork_cleanup_namespace;
 
+#ifdef CONFIG_KEVENT_SIGNAL
+	kevent_storage_init(p, &p->st);
+#endif
+
 	p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
 	/*
 	 * Clear TID on mm_release()?
diff --git a/kernel/kevent/kevent_signal.c b/kernel/kevent/kevent_signal.c
new file mode 100644
index 0000000..abe3972
--- /dev/null
+++ b/kernel/kevent/kevent_signal.c
@@ -0,0 +1,94 @@
+/*
+ * 	kevent_signal.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/kevent.h>
+
+static int kevent_signal_callback(struct kevent *k)
+{
+	struct task_struct *tsk = k->st->origin;
+	int sig = k->event.id.raw[0];
+	int ret = 0;
+
+	if (sig == tsk->kevent_signals)
+		ret = 1;
+
+	if (ret && (k->event.id.raw_u64 & KEVENT_SIGNAL_NOMASK))
+		tsk->kevent_signals |= 0x80000000;
+
+	return ret;
+}
+
+int kevent_signal_enqueue(struct kevent *k)
+{
+	int err;
+
+	err = kevent_storage_enqueue(&current->st, k);
+	if (err)
+		goto err_out_exit;
+
+	if (k->event.req_flags & KEVENT_REQ_ALWAYS_QUEUE) {
+		kevent_requeue(k);
+		err = 0;
+	} else {
+		err = k->callbacks.callback(k);
+		if (err)
+			goto err_out_dequeue;
+	}
+
+	return err;
+
+err_out_dequeue:
+	kevent_storage_dequeue(k->st, k);
+err_out_exit:
+	return err;
+}
+
+int kevent_signal_dequeue(struct kevent *k)
+{
+	kevent_storage_dequeue(k->st, k);
+	return 0;
+}
+
+int kevent_signal_notify(struct task_struct *tsk, int sig)
+{
+	tsk->kevent_signals = sig;
+	kevent_storage_ready(&tsk->st, NULL, KEVENT_SIGNAL_DELIVERY);
+	return (tsk->kevent_signals & 0x80000000);
+}
+
+static int __init kevent_init_signal(void)
+{
+	struct kevent_callbacks sc = {
+		.callback = &kevent_signal_callback,
+		.enqueue = &kevent_signal_enqueue,
+		.dequeue = &kevent_signal_dequeue,
+		.flags = 0,
+	};
+
+	return kevent_add_callbacks(&sc, KEVENT_SIGNAL);
+}
+module_init(kevent_init_signal);
diff --git a/kernel/signal.c b/kernel/signal.c
index fb5da6d..d3d3594 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -23,6 +23,7 @@
 #include <linux/ptrace.h>
 #include <linux/signal.h>
 #include <linux/capability.h>
+#include <linux/kevent.h>
 #include <asm/param.h>
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -703,6 +704,9 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
 {
 	struct sigqueue * q = NULL;
 	int ret = 0;
+	
+	if (kevent_signal_notify(t, sig))
+		return 1;
 
 	/*
 	 * fast-pathed signals for kernel-internal things like SIGSTOP
@@ -782,6 +786,17 @@ specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t)
 	ret = send_signal(sig, info, t, &t->pending);
 	if (!ret && !sigismember(&t->blocked, sig))
 		signal_wake_up(t, sig == SIGKILL);
+#ifdef CONFIG_KEVENT_SIGNAL
+	/*
+	 * Kevent allows to deliver signals through kevent queue, 
+	 * it is possible to setup kevent to not deliver
+	 * signal through the usual way, in that case send_signal()
+	 * returns 1 and signal is delivered only through kevent queue.
+	 * We simulate successfull delivery notification through this hack:
+	 */
+	if (ret == 1)
+		ret = 0;
+#endif
 out:
 	return ret;
 }
@@ -971,6 +986,17 @@ __group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
 	 * to avoid several races.
 	 */
 	ret = send_signal(sig, info, p, &p->signal->shared_pending);
+#ifdef CONFIG_KEVENT_SIGNAL
+	/*
+	 * Kevent allows to deliver signals through kevent queue, 
+	 * it is possible to setup kevent to not deliver
+	 * signal through the usual way, in that case send_signal()
+	 * returns 1 and signal is delivered only through kevent queue.
+	 * We simulate successfull delivery notification through this hack:
+	 */
+	if (ret == 1)
+		ret = 0;
+#endif
 	if (unlikely(ret))
 		return ret;
 


^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [take28-resend_1->0 0/8] kevent: Generic event handling mechanism.
  2006-12-21  9:14 ` [take28-resend_1->0 " Evgeniy Polyakov
  2006-12-21  9:14   ` [take28-resend_1->0 1/8] kevent: Description Evgeniy Polyakov
@ 2006-12-21 10:35   ` Evgeniy Polyakov
  2006-12-21 10:41     ` Jeff Garzik
  1 sibling, 1 reply; 76+ messages in thread
From: Evgeniy Polyakov @ 2006-12-21 10:35 UTC (permalink / raw)
  To: linux-kernel
  Cc: David Miller, Ulrich Drepper, Andrew Morton, netdev, Zach Brown,
	Christoph Hellwig, Chase Venters, Johann Borck, Jeff Garzik

On Thu, Dec 21, 2006 at 12:14:17PM +0300, Evgeniy Polyakov (johnpol@2ka.mipt.ru) wrote:
> 
> Generic event handling mechanism.
> 
> Kevent is a generic subsytem which allows to handle event notifications.
> It supports both level and edge triggered events. It is similar to
> poll/epoll in some cases, but it is more scalable, it is faster and
> allows to work with essentially eny kind of events.
> 
> Events are provided into kernel through control syscall and can be read
> back through ring buffer or using usual syscalls.
> Kevent update (i.e. readiness switching) happens directly from internals
> of the appropriate state machine of the underlying subsytem (like
> network, filesystem, timer or any other).
> 
> Homepage:
> http://tservice.net.ru/~s0mbre/old/?section=projects&item=kevent
> 
> Documentation page:
> http://linux-net.osdl.org/index.php/Kevent
> 
> Consider for inclusion.

Due to this stall kevent inclusion into lighttpd CVS tree is postponed.

The last version will be released saturday or sunday, and looking into
overhelming flow of feedback comments on this feature, project will not
be released to linux-kernel@, after this I will
complete netchannels support and start kevent based AIO project - mostly
network AIO with new design, which is based on set of entities, which
can describe set of tasks which should be performed
asynchronously (from user point of view, although read and write
obviously must be done after open and before close), for example syscall
which gets as parameter destination socket and local filename (with
optional offset and length fields), which will asynchronously from user
point of view open a file and transfer requested part to the destination
socket and then return opened file descriptor (or it can be closed if
requested). Similar mechanism can be done for read/write calls.

Interested parties will be able to apply patches for theirs own kernels.

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [take28-resend_1->0 0/8] kevent: Generic event handling mechanism.
  2006-12-21 10:35   ` [take28-resend_1->0 0/8] kevent: Generic event handling mechanism Evgeniy Polyakov
@ 2006-12-21 10:41     ` Jeff Garzik
  2006-12-21 10:49       ` Evgeniy Polyakov
  0 siblings, 1 reply; 76+ messages in thread
From: Jeff Garzik @ 2006-12-21 10:41 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: linux-kernel, David Miller, Ulrich Drepper, Andrew Morton,
	netdev, Zach Brown, Christoph Hellwig, Chase Venters,
	Johann Borck

Evgeniy Polyakov wrote:
> On Thu, Dec 21, 2006 at 12:14:17PM +0300, Evgeniy Polyakov (johnpol@2ka.mipt.ru) wrote:
>> Generic event handling mechanism.
>>
>> Kevent is a generic subsytem which allows to handle event notifications.
>> It supports both level and edge triggered events. It is similar to
>> poll/epoll in some cases, but it is more scalable, it is faster and
>> allows to work with essentially eny kind of events.
>>
>> Events are provided into kernel through control syscall and can be read
>> back through ring buffer or using usual syscalls.
>> Kevent update (i.e. readiness switching) happens directly from internals
>> of the appropriate state machine of the underlying subsytem (like
>> network, filesystem, timer or any other).
>>
>> Homepage:
>> http://tservice.net.ru/~s0mbre/old/?section=projects&item=kevent
>>
>> Documentation page:
>> http://linux-net.osdl.org/index.php/Kevent
>>
>> Consider for inclusion.
> 
> Due to this stall kevent inclusion into lighttpd CVS tree is postponed.
> 
> The last version will be released saturday or sunday, and looking into
> overhelming flow of feedback comments on this feature, project will not
> be released to linux-kernel@, after this I will
> complete netchannels support and start kevent based AIO project - mostly
> network AIO with new design, which is based on set of entities, which
> can describe set of tasks which should be performed
> asynchronously (from user point of view, although read and write
> obviously must be done after open and before close), for example syscall

kevent is being considered for inclusion, but there is no need to get 
impatient.  Once kevent code stops getting revised rapidly, Andrew 
Morton can pick it up for -mm, for wide dissemination, testing and 
review.  After that phase, it can be pushed to mainline.

The feeling I get from other kernel hackers is that you are demanding 
inclusion "now! now! now!" rather than giving all stakeholders a chance 
to give input, and let your design sink into the collective brain.

This isn't just an optional feature but a key new addition to the 
kernel.  So we should intentionally take more time and consideration 
than normal.  We don't want to go back and have to change fundamental 
kevent details due to design flaws, we want to get it right.

	Jeff




^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [take28-resend_1->0 0/8] kevent: Generic event handling mechanism.
  2006-12-21 10:41     ` Jeff Garzik
@ 2006-12-21 10:49       ` Evgeniy Polyakov
  2006-12-21 10:57         ` Evgeniy Polyakov
  2006-12-21 13:48         ` jamal
  0 siblings, 2 replies; 76+ messages in thread
From: Evgeniy Polyakov @ 2006-12-21 10:49 UTC (permalink / raw)
  To: Jeff Garzik
  Cc: linux-kernel, David Miller, Ulrich Drepper, Andrew Morton,
	netdev, Zach Brown, Christoph Hellwig, Chase Venters,
	Johann Borck

On Thu, Dec 21, 2006 at 05:41:41AM -0500, Jeff Garzik (jeff@garzik.org) wrote:
> Evgeniy Polyakov wrote:
> >On Thu, Dec 21, 2006 at 12:14:17PM +0300, Evgeniy Polyakov 
> >(johnpol@2ka.mipt.ru) wrote:
> >>Generic event handling mechanism.
> >>
> >>Kevent is a generic subsytem which allows to handle event notifications.
> >>It supports both level and edge triggered events. It is similar to
> >>poll/epoll in some cases, but it is more scalable, it is faster and
> >>allows to work with essentially eny kind of events.
> >>
> >>Events are provided into kernel through control syscall and can be read
> >>back through ring buffer or using usual syscalls.
> >>Kevent update (i.e. readiness switching) happens directly from internals
> >>of the appropriate state machine of the underlying subsytem (like
> >>network, filesystem, timer or any other).
> >>
> >>Homepage:
> >>http://tservice.net.ru/~s0mbre/old/?section=projects&item=kevent
> >>
> >>Documentation page:
> >>http://linux-net.osdl.org/index.php/Kevent
> >>
> >>Consider for inclusion.
> >
> >Due to this stall kevent inclusion into lighttpd CVS tree is postponed.
> >
> >The last version will be released saturday or sunday, and looking into
> >overhelming flow of feedback comments on this feature, project will not
> >be released to linux-kernel@, after this I will
> >complete netchannels support and start kevent based AIO project - mostly
> >network AIO with new design, which is based on set of entities, which
> >can describe set of tasks which should be performed
> >asynchronously (from user point of view, although read and write
> >obviously must be done after open and before close), for example syscall
> 
> kevent is being considered for inclusion, but there is no need to get 
> impatient.  Once kevent code stops getting revised rapidly, Andrew 
> Morton can pick it up for -mm, for wide dissemination, testing and 
> review.  After that phase, it can be pushed to mainline.

I do not say 'hey, include it now, or I will cry', I just want to have
_some_ progress. But I do not get _any_ feedback. What should I think?
I doubt bothering people each third day with new resend is a good idea,
so I plan to drop it.

Btw, Andrew dropped 'take23' patchset from his tree, when it was
obsoleted, but did not import later versions :)

> The feeling I get from other kernel hackers is that you are demanding 
> inclusion "now! now! now!" rather than giving all stakeholders a chance 
> to give input, and let your design sink into the collective brain.

No. I do not want immediate inclusion, I want progress, so I could setup
my plans on it - if people keep silence, I stop this and work on my own
kevent goals, since I like current state of hte kevent for my tasks.

I do not hack for inclusion.

> This isn't just an optional feature but a key new addition to the 
> kernel.  So we should intentionally take more time and consideration 
> than normal.  We don't want to go back and have to change fundamental 
> kevent details due to design flaws, we want to get it right.

So comment on its bugs, its design, implementation, ask questions,
request features, show interest (even with 'I have no time right now,
but will loko at it after in a week after vacations').

No one does it, so no one cares, so my behaviour.

> 	Jeff

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [take28-resend_1->0 0/8] kevent: Generic event handling mechanism.
  2006-12-21 10:49       ` Evgeniy Polyakov
@ 2006-12-21 10:57         ` Evgeniy Polyakov
  2006-12-21 13:48         ` jamal
  1 sibling, 0 replies; 76+ messages in thread
From: Evgeniy Polyakov @ 2006-12-21 10:57 UTC (permalink / raw)
  To: Jeff Garzik
  Cc: linux-kernel, David Miller, Ulrich Drepper, Andrew Morton,
	netdev, Zach Brown, Christoph Hellwig, Chase Venters,
	Johann Borck

On Thu, Dec 21, 2006 at 01:49:18PM +0300, Evgeniy Polyakov (johnpol@2ka.mipt.ru) wrote:
> So comment on its bugs, its design, implementation, ask questions,
> request features, show interest (even with 'I have no time right now,
> but will loko at it after in a week after vacations').
> 
> No one does it, so no one cares, so my behaviour.

Btw, kevent will celebrate its first birtday (one year!) in two weeks
(Jan 6). Feel free to party!

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [take28-resend_1->0 0/8] kevent: Generic event handling mechanism.
  2006-12-21 10:49       ` Evgeniy Polyakov
  2006-12-21 10:57         ` Evgeniy Polyakov
@ 2006-12-21 13:48         ` jamal
  2006-12-21 14:04           ` Evgeniy Polyakov
  1 sibling, 1 reply; 76+ messages in thread
From: jamal @ 2006-12-21 13:48 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: Jeff Garzik, linux-kernel, David Miller, Ulrich Drepper,
	Andrew Morton, netdev, Zach Brown, Christoph Hellwig,
	Chase Venters, Johann Borck

Evgeniy,

On Thu, 2006-21-12 at 13:49 +0300, Evgeniy Polyakov wrote:

> So comment on its bugs, its design, implementation, ask questions,
> request features, show interest (even with 'I have no time right now,
> but will loko at it after in a week after vacations').
> 
> No one does it, so no one cares, so my behaviour.
> 

Please dont be discouraged by lack of attention - you are doing good
work.

I will concur with Jeff's point that since you are putting out a
profound conceptual changes, and there are many stake holders, it
requires scrutiny on their part. You need to build consensus in such a
situation. 
Some things that would help progress and build momentum:
- As i have advised you before, why dont you modify something like
existing libraries such as some of the loop thingies of desktop managers
such as kde/gnome or better things like libevent etc. Then write your
app on top of that? nobody is gonna run your httpd but if you
demonstrate that libevent is much better with your changes (with zero
changes to apps), people will migrate
- from a user space angle if people like Ulrich would state their views
on the current version you have. 
Note, they dont have to agree with you i.e the conclusion could be a
simple "agree to disagree".
- There really oughta be a limit on how long people are allowed to be
silent. After that IMO your code should just go in ...
 

cheers,
jamal


^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [take28-resend_1->0 0/8] kevent: Generic event handling mechanism.
  2006-12-21 13:48         ` jamal
@ 2006-12-21 14:04           ` Evgeniy Polyakov
  2006-12-21 14:21             ` jamal
  0 siblings, 1 reply; 76+ messages in thread
From: Evgeniy Polyakov @ 2006-12-21 14:04 UTC (permalink / raw)
  To: jamal
  Cc: Jeff Garzik, linux-kernel, David Miller, Ulrich Drepper,
	Andrew Morton, netdev, Zach Brown, Christoph Hellwig,
	Chase Venters, Johann Borck

On Thu, Dec 21, 2006 at 08:48:05AM -0500, jamal (hadi@cyberus.ca) wrote:
> Evgeniy,
> 
> On Thu, 2006-21-12 at 13:49 +0300, Evgeniy Polyakov wrote:
> 
> > So comment on its bugs, its design, implementation, ask questions,
> > request features, show interest (even with 'I have no time right now,
> > but will loko at it after in a week after vacations').
> > 
> > No one does it, so no one cares, so my behaviour.
> > 
> 
> Please dont be discouraged by lack of attention - you are doing good
> work.
> 
> I will concur with Jeff's point that since you are putting out a
> profound conceptual changes, and there are many stake holders, it
> requires scrutiny on their part. You need to build consensus in such a
> situation. 
> Some things that would help progress and build momentum:
> - As i have advised you before, why dont you modify something like
> existing libraries such as some of the loop thingies of desktop managers
> such as kde/gnome or better things like libevent etc. Then write your
> app on top of that? nobody is gonna run your httpd but if you
> demonstrate that libevent is much better with your changes (with zero
> changes to apps), people will migrate
> - from a user space angle if people like Ulrich would state their views
> on the current version you have. 
> Note, they dont have to agree with you i.e the conclusion could be a
> simple "agree to disagree".
> - There really oughta be a limit on how long people are allowed to be
> silent. After that IMO your code should just go in ...

I modified world-wide used web server lighttpd and ran a lot of tests
with it (compared to epoll version with major performance win).

I was asked yesterday by Jan Kneschke (lighttpd main developer) if
kevent API is ready so he could include my patch into mainline lighttpd 
tree, but I answered 'I do not know if kevent will be or not included, 
everyone keeps silence'.

I just do not know _what_ else should be done not even for inclusion - 
but at least for some progress.

You want libevent to be patched? Its site is currently down, but ok, I
will create a patch.

> cheers,
> jamal

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [take28-resend_1->0 0/8] kevent: Generic event handling mechanism.
  2006-12-21 14:04           ` Evgeniy Polyakov
@ 2006-12-21 14:21             ` jamal
  2006-12-21 14:23               ` Evgeniy Polyakov
  0 siblings, 1 reply; 76+ messages in thread
From: jamal @ 2006-12-21 14:21 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: Jeff Garzik, linux-kernel, David Miller, Ulrich Drepper,
	Andrew Morton, netdev, Zach Brown, Christoph Hellwig,
	Chase Venters, Johann Borck

On Thu, 2006-21-12 at 17:04 +0300, Evgeniy Polyakov wrote:

> I modified world-wide used web server lighttpd and ran a lot of tests
> with it (compared to epoll version with major performance win).
> 
> I was asked yesterday by Jan Kneschke (lighttpd main developer) if
> kevent API is ready so he could include my patch into mainline lighttpd 
> tree, but I answered 'I do not know if kevent will be or not included, 
> everyone keeps silence'.
> 

Ok, so you are building the momentum then. People like Jan are the users
of such API and should be speaking up. Then kernel people will be forced
to look at it.

> I just do not know _what_ else should be done not even for inclusion - 
> but at least for some progress.

I know you are frustrated but stop doing the above like a broken vinyl
record, it doesnt help your case. 

> You want libevent to be patched? Its site is currently down, but ok, I
> will create a patch.
> 

I promise in 2 weeks to migrate an app or two that i have that use
libevent to your version if you have it by then. I will then test and
give you my opinion. 

cheers,
jamal


^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [take28-resend_1->0 0/8] kevent: Generic event handling mechanism.
  2006-12-21 14:21             ` jamal
@ 2006-12-21 14:23               ` Evgeniy Polyakov
  2006-12-21 14:36                 ` Evgeniy Polyakov
  0 siblings, 1 reply; 76+ messages in thread
From: Evgeniy Polyakov @ 2006-12-21 14:23 UTC (permalink / raw)
  To: jamal
  Cc: Jeff Garzik, linux-kernel, David Miller, Ulrich Drepper,
	Andrew Morton, netdev, Zach Brown, Christoph Hellwig,
	Chase Venters, Johann Borck

On Thu, Dec 21, 2006 at 09:21:07AM -0500, jamal (hadi@cyberus.ca) wrote:
> > I just do not know _what_ else should be done not even for inclusion - 
> > but at least for some progress.
> 
> I know you are frustrated but stop doing the above like a broken vinyl
> record, it doesnt help your case. 

That's why I'm going to stop void resending completely.

> > You want libevent to be patched? Its site is currently down, but ok, I
> > will create a patch.
> > 
> 
> I promise in 2 weeks to migrate an app or two that i have that use
> libevent to your version if you have it by then. I will then test and
> give you my opinion. 

Ok, when site will be ready I will patch libevent and post patch or link
in this thread. I plan to complete it this week.

> cheers,
> jamal

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [take28-resend_1->0 0/8] kevent: Generic event handling mechanism.
  2006-12-21 14:23               ` Evgeniy Polyakov
@ 2006-12-21 14:36                 ` Evgeniy Polyakov
  2006-12-21 14:40                   ` jamal
  0 siblings, 1 reply; 76+ messages in thread
From: Evgeniy Polyakov @ 2006-12-21 14:36 UTC (permalink / raw)
  To: jamal
  Cc: Jeff Garzik, linux-kernel, David Miller, Ulrich Drepper,
	Andrew Morton, netdev, Zach Brown, Christoph Hellwig,
	Chase Venters, Johann Borck

On Thu, Dec 21, 2006 at 05:23:37PM +0300, Evgeniy Polyakov (johnpol@2ka.mipt.ru) wrote:
> Ok, when site will be ready I will patch libevent and post patch or link
> in this thread. I plan to complete it this week.

Btw, it uses only read/write/signal on fd events, so it must use
->poll() and thus be as fast as epoll. 

Things like sockets/pipes can only benefit from direct kevent usage 
instead of ->poll() and wrappers.

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [take28-resend_1->0 0/8] kevent: Generic event handling mechanism.
  2006-12-21 14:36                 ` Evgeniy Polyakov
@ 2006-12-21 14:40                   ` jamal
  2006-12-21 14:46                     ` Evgeniy Polyakov
  0 siblings, 1 reply; 76+ messages in thread
From: jamal @ 2006-12-21 14:40 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: Jeff Garzik, linux-kernel, David Miller, Ulrich Drepper,
	Andrew Morton, netdev, Zach Brown, Christoph Hellwig,
	Chase Venters, Johann Borck

On Thu, 2006-21-12 at 17:36 +0300, Evgeniy Polyakov wrote:

> Btw, it uses only read/write/signal on fd events, so it must use
> ->poll() and thus be as fast as epoll. 
> 

It is supposed to "detect" the best mechanism in the kernel and switch
to that.
At the moment for example in my app it defaults to epoll.

> Things like sockets/pipes can only benefit from direct kevent usage 
> instead of ->poll() and wrappers.

You should be able change it to use those schemes when it detects
that the kernel supports them.

cheers,
jamal


^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [take28-resend_1->0 0/8] kevent: Generic event handling mechanism.
  2006-12-21 14:40                   ` jamal
@ 2006-12-21 14:46                     ` Evgeniy Polyakov
  2006-12-21 16:42                       ` jamal
  0 siblings, 1 reply; 76+ messages in thread
From: Evgeniy Polyakov @ 2006-12-21 14:46 UTC (permalink / raw)
  To: jamal
  Cc: Jeff Garzik, linux-kernel, David Miller, Ulrich Drepper,
	Andrew Morton, netdev, Zach Brown, Christoph Hellwig,
	Chase Venters, Johann Borck

On Thu, Dec 21, 2006 at 09:40:26AM -0500, jamal (hadi@cyberus.ca) wrote:
> > Things like sockets/pipes can only benefit from direct kevent usage 
> > instead of ->poll() and wrappers.
> 
> You should be able change it to use those schemes when it detects
> that the kernel supports them.

I.e. stat() for each new file descriptor - note, that _you_ asked it :)

> cheers,
> jamal

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [take28-resend_1->0 0/8] kevent: Generic event handling mechanism.
  2006-12-21 14:46                     ` Evgeniy Polyakov
@ 2006-12-21 16:42                       ` jamal
  2006-12-21 16:51                         ` Evgeniy Polyakov
  0 siblings, 1 reply; 76+ messages in thread
From: jamal @ 2006-12-21 16:42 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: Jeff Garzik, linux-kernel, David Miller, Ulrich Drepper,
	Andrew Morton, netdev, Zach Brown, Christoph Hellwig,
	Chase Venters, Johann Borck

On Thu, 2006-21-12 at 17:46 +0300, Evgeniy Polyakov wrote:
> On Thu, Dec 21, 2006 at 09:40:26AM -0500, jamal (hadi@cyberus.ca) wrote:
> > > Things like sockets/pipes can only benefit from direct kevent usage 
> > > instead of ->poll() and wrappers.
> > 
> > You should be able change it to use those schemes when it detects
> > that the kernel supports them.
> 
> I.e. stat() for each new file descriptor - note, that _you_ asked it :)

Didnt follow. Is there some issue with libevent you mean? 

cheers,
jamal



^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [take28-resend_1->0 0/8] kevent: Generic event handling mechanism.
  2006-12-21 16:42                       ` jamal
@ 2006-12-21 16:51                         ` Evgeniy Polyakov
  0 siblings, 0 replies; 76+ messages in thread
From: Evgeniy Polyakov @ 2006-12-21 16:51 UTC (permalink / raw)
  To: jamal
  Cc: Jeff Garzik, linux-kernel, David Miller, Ulrich Drepper,
	Andrew Morton, netdev, Zach Brown, Christoph Hellwig,
	Chase Venters, Johann Borck

On Thu, Dec 21, 2006 at 11:42:04AM -0500, jamal (hadi@cyberus.ca) wrote:
> > > > Things like sockets/pipes can only benefit from direct kevent usage 
> > > > instead of ->poll() and wrappers.
> > > 
> > > You should be able change it to use those schemes when it detects
> > > that the kernel supports them.
> > 
> > I.e. stat() for each new file descriptor - note, that _you_ asked it :)
> 
> Didnt follow. Is there some issue with libevent you mean? 

libevent provides file descriptor without any additional info about it -
so when it is added into the waiting subsystem, userspace must select
different usage cases (i.e. different kevent notifications for different
types of file descriptor - socket notifications for sockets and pipes,
poll/select for all others), this requires stat() call per provided file
descriptor.

Event addition/waiting itself is the same - only parameters (type and
requested event) are changed.

> cheers,
> jamal

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 76+ messages in thread

* [take29 0/8] kevent: Generic event handling mechanism.
       [not found] <3154985aa0591036@2ka.mipt.ru>
  2006-12-17 13:53 ` [take28-resend_2->0 0/8] kevent: Generic event handling mechanism Evgeniy Polyakov
  2006-12-21  9:14 ` [take28-resend_1->0 " Evgeniy Polyakov
@ 2006-12-23 16:51 ` Evgeniy Polyakov
  2006-12-23 16:51   ` [take29 1/8] kevent: Description Evgeniy Polyakov
                     ` (3 more replies)
  2006-12-29 12:25 ` [take30 0/9] " Evgeniy Polyakov
  2007-01-08 19:25 ` [take31 0/10] kevent: Generic event handling mechanism Evgeniy Polyakov
  4 siblings, 4 replies; 76+ messages in thread
From: Evgeniy Polyakov @ 2006-12-23 16:51 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: David Miller, Ulrich Drepper, Andrew Morton, Evgeniy Polyakov,
	netdev, Zach Brown, Christoph Hellwig, Chase Venters,
	Johann Borck, linux-kernel, Jeff Garzik, Jamal Hadi Salim


Generic event handling mechanism.

Kevent is a generic subsytem which allows to handle event notifications.
It supports both level and edge triggered events. It is similar to
poll/epoll in some cases, but it is more scalable, it is faster and
allows to work with essentially eny kind of events.

Events are provided into kernel through control syscall and can be read
back through ring buffer or using usual syscalls.
Kevent update (i.e. readiness switching) happens directly from internals
of the appropriate state machine of the underlying subsytem (like
network, filesystem, timer or any other).

Homepage:
http://tservice.net.ru/~s0mbre/old/?section=projects&item=kevent

Documentation page:
http://linux-net.osdl.org/index.php/Kevent

Consider for inclusion.

New benchmark, which can be a hoax though, can be found at 
http://tservice.net.ru/~s0mbre/blog/2006/11/30#2006_11_30
where kevent on amd64 with 1gb of ram can handle more than 7200 events per 
second with 8000 requests concurrency with 'ab' benchmark and lighttpd.
Although I tought it should not be published due to possible errors,
I decided to send it for review.

With this release I start 3 days resending timeout - i.e. each third day I 
will send either new version (if something new was requested and agreed to 
be implemented) or resending with back counter started from three. 
When back counter hits zero after three resending I consider there is no 
interest in subsystem and I will stop further sending.
First one will be sent about Jan 10.

Thanks for understanding and your time.

Changes from 'take28' patchset:
 * optimized af_unix to use socket notifications
 * changed ALWAYS_QUEUE behaviour with poll/select notifications - previously
 	kevent was not queued into poll wait queue when ALWAYS_QUEUE flag
	is set
 * added KEVENT_POLL_POLLRDHUP definition into ukevent.h header
 * libevent-1.2 patch (Jamal, your request is completed, so I'm waiting two weeks
 	before starting final countdown :)
	All regression tests passed successfully except test_evbuffer(), which
	crashes on my amd64 linux 2.6 test machine for all types of notifications,
	probably it is fixed in libevent-1.2a version, I did not check.
	Patch and README can be found at project homepage.

Changes from 'take27' patchset:
 * made kevent default yes in non embedded case.
 * added falgs to callback structures - currently used to check if kevent
 	can be requested from kernelspace only (posix timers) or 
	userspace (all others)

Changes from 'take26' patchset:
 * made kevent visible in config only in case of embedded setup.
 * added comment about KEVENT_MAX number.
 * spell fix.

Changes from 'take25' patchset:
 * use timespec as timeout parameter.
 * added high-resolution timer to handle absolute timeouts.
 * added flags to waiting and initialization syscalls.
 * kevent_commit() has new_uidx parameter.
 * kevent_wait() has old_uidx parameter, which, if not equal to u->uidx,
 	results in immediate wakeup (usefull for the case when entries
	are added asynchronously from kernel (not supported for now)).
 * added interface to mark any event as ready.
 * event POSIX timers support.
 * return -ENOSYS if there is no registered event type.
 * provided file descriptor must be checked for fifo type (spotted by Eric Dumazet).
 * signal notifications.
 * documentation update.
 * lighttpd patch updated (the latest benchmarks with lighttpd patch can be found in blog).

Changes from 'take24' patchset:
 * new (old (new)) ring buffer implementation with kernel and user indexes.
 * added initialization syscall instead of opening /dev/kevent
 * kevent_commit() syscall to commit ring buffer entries
 * changed KEVENT_REQ_WAKEUP_ONE flag to KEVENT_REQ_WAKEUP_ALL, kevent wakes
   only first thread always if that flag is not set
 * KEVENT_REQ_ALWAYS_QUEUE flag. If set, kevent will be queued into ready queue
   instead of copying back to userspace when kevent is ready immediately when
   it is added.
 * lighttpd patch (Hail! Although nothing really outstanding compared to epoll)

Changes from 'take23' patchset:
 * kevent PIPE notifications
 * KEVENT_REQ_LAST_CHECK flag, which allows to perform last check at dequeueing time
 * fixed poll/select notifications (were broken due to tree manipulations)
 * made Documentation/kevent.txt look nice in 80-col terminal
 * fix for copy_to_user() failure report for the first kevent (Andrew Morton)
 * minor function renames

Changes from 'take22' patchset:
 * new ring buffer implementation in process' memory
 * wakeup-one-thread flag
 * edge-triggered behaviour

Changes from 'take21' patchset:
 * minor cleanups (different return values, removed unneded variables, whitespaces and so on)
 * fixed bug in kevent removal in case when kevent being removed
   is the same as overflow_kevent (spotted by Eric Dumazet)

Changes from 'take20' patchset:
 * new ring buffer implementation
 * removed artificial limit on possible number of kevents

Changes from 'take19' patchset:
 * use __init instead of __devinit
 * removed 'default N' from config for user statistic
 * removed kevent_user_fini() since kevent can not be unloaded
 * use KERN_INFO for statistic output

Changes from 'take18' patchset:
 * use __init instead of __devinit
 * removed 'default N' from config for user statistic
 * removed kevent_user_fini() since kevent can not be unloaded
 * use KERN_INFO for statistic output

Changes from 'take17' patchset:
 * Use RB tree instead of hash table. 
	At least for a web sever, frequency of addition/deletion of new kevent 
	is comparable with number of search access, i.e. most of the time events 
	are added, accesed only couple of times and then removed, so it justifies 
	RB tree usage over AVL tree, since the latter does have much slower deletion 
	time (max O(log(N)) compared to 3 ops), 
	although faster search time (1.44*O(log(N)) vs. 2*O(log(N))). 
	So for kevents I use RB tree for now and later, when my AVL tree implementation 
	is ready, it will be possible to compare them.
 * Changed readiness check for socket notifications.

With both above changes it is possible to achieve more than 3380 req/second compared to 2200, 
sometimes 2500 req/second for epoll() for trivial web-server and httperf client on the same
hardware.
It is possible that above kevent limit is due to maximum allowed kevents in a time limit, which is
4096 events.

Changes from 'take16' patchset:
 * misc cleanups (__read_mostly, const ...)
 * created special macro which is used for mmap size (number of pages) calculation
 * export kevent_socket_notify(), since it is used in network protocols which can be 
	built as modules (IPv6 for example)

Changes from 'take15' patchset:
 * converted kevent_timer to high-resolution timers, this forces timer API update at
	http://linux-net.osdl.org/index.php/Kevent
 * use struct ukevent* instead of void * in syscalls (documentation has been updated)
 * added warning in kevent_add_ukevent() if ring has broken index (for testing)

Changes from 'take14' patchset:
 * added kevent_wait()
    This syscall waits until either timeout expires or at least one event
    becomes ready. It also commits that @num events from @start are processed
    by userspace and thus can be be removed or rearmed (depending on it's flags).
    It can be used for commit events read by userspace through mmap interface.
    Example userspace code (evtest.c) can be found on project's homepage.
 * added socket notifications (send/recv/accept)

Changes from 'take13' patchset:
 * do not get lock aroung user data check in __kevent_search()
 * fail early if there were no registered callbacks for given type of kevent
 * trailing whitespace cleanup

Changes from 'take12' patchset:
 * remove non-chardev interface for initialization
 * use pointer to kevent_mring instead of unsigned longs
 * use aligned 64bit type in raw user data (can be used by high-res timer if needed)
 * simplified enqueue/dequeue callbacks and kevent initialization
 * use nanoseconds for timeout
 * put number of milliseconds into timer's return data
 * move some definitions into user-visible header
 * removed filenames from comments

Changes from 'take11' patchset:
 * include missing headers into patchset
 * some trivial code cleanups (use goto instead of if/else games and so on)
 * some whitespace cleanups
 * check for ready_callback() callback before main loop which should save us some ticks

Changes from 'take10' patchset:
 * removed non-existent prototypes
 * added helper function for kevent_registered_callbacks
 * fixed 80 lines comments issues
 * added shared between userspace and kernelspace header instead of embedd them in one
 * core restructuring to remove forward declarations
 * s o m e w h i t e s p a c e c o d y n g s t y l e c l e a n u p
 * use vm_insert_page() instead of remap_pfn_range()

Changes from 'take9' patchset:
 * fixed ->nopage method

Changes from 'take8' patchset:
 * fixed mmap release bug
 * use module_init() instead of late_initcall()
 * use better structures for timer notifications

Changes from 'take7' patchset:
 * new mmap interface (not tested, waiting for other changes to be acked)
	- use nopage() method to dynamically substitue pages
	- allocate new page for events only when new added kevent requres it
	- do not use ugly index dereferencing, use structure instead
	- reduced amount of data in the ring (id and flags), 
		maximum 12 pages on x86 per kevent fd

Changes from 'take6' patchset:
 * a lot of comments!
 * do not use list poisoning for detection of the fact, that entry is in the list
 * return number of ready kevents even if copy*user() fails
 * strict check for number of kevents in syscall
 * use ARRAY_SIZE for array size calculation
 * changed superblock magic number
 * use SLAB_PANIC instead of direct panic() call
 * changed -E* return values
 * a lot of small cleanups and indent fixes

Changes from 'take5' patchset:
 * removed compilation warnings about unused wariables when lockdep is not turned on
 * do not use internal socket structures, use appropriate (exported) wrappers instead
 * removed default 1 second timeout
 * removed AIO stuff from patchset

Changes from 'take4' patchset:
 * use miscdevice instead of chardevice
 * comments fixes

Changes from 'take3' patchset:
 * removed serializing mutex from kevent_user_wait()
 * moved storage list processing to RCU
 * removed lockdep screaming - all storage locks are initialized in the same function, so it was
learned 
	to differentiate between various cases
 * remove kevent from storage if is marked as broken after callback
 * fixed a typo in mmaped buffer implementation which would end up in wrong index calcualtion 

Changes from 'take2' patchset:
 * split kevent_finish_user() to locked and unlocked variants
 * do not use KEVENT_STAT ifdefs, use inline functions instead
 * use array of callbacks of each type instead of each kevent callback initialization
 * changed name of ukevent guarding lock
 * use only one kevent lock in kevent_user for all hash buckets instead of per-bucket locks
 * do not use kevent_user_ctl structure instead provide needed arguments as syscall parameters
 * various indent cleanups
 * added optimisation, which is aimed to help when a lot of kevents are being copied from
userspace
 * mapped buffer (initial) implementation (no userspace yet)

Changes from 'take1' patchset:
 - rebased against 2.6.18-git tree
 - removed ioctl controlling
 - added new syscall kevent_get_events(int fd, unsigned int min_nr, unsigned int max_nr,
			unsigned int timeout, void __user *buf, unsigned flags)
 - use old syscall kevent_ctl for creation/removing, modification and initial kevent 
	initialization
 - use mutuxes instead of semaphores
 - added file descriptor check and return error if provided descriptor does not match
	kevent file operations
 - various indent fixes
 - removed aio_sendfile() declarations.

Thank you.

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>



^ permalink raw reply	[flat|nested] 76+ messages in thread

* [take29 2/8] kevent: Core files.
  2006-12-23 16:51   ` [take29 1/8] kevent: Description Evgeniy Polyakov
@ 2006-12-23 16:51     ` Evgeniy Polyakov
  2006-12-23 16:51       ` [take29 3/8] kevent: poll/select() notifications Evgeniy Polyakov
  0 siblings, 1 reply; 76+ messages in thread
From: Evgeniy Polyakov @ 2006-12-23 16:51 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: David Miller, Ulrich Drepper, Andrew Morton, Evgeniy Polyakov,
	netdev, Zach Brown, Christoph Hellwig, Chase Venters,
	Johann Borck, linux-kernel, Jeff Garzik, Jamal Hadi Salim


Core files.

This patch includes core kevent files:
 * userspace controlling
 * kernelspace interfaces
 * initialization
 * notification state machines

Some bits of documentation can be found on project's homepage (and links from there):
http://tservice.net.ru/~s0mbre/old/?section=projects&item=kevent

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>

diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S
index 7e639f7..a6221c2 100644
--- a/arch/i386/kernel/syscall_table.S
+++ b/arch/i386/kernel/syscall_table.S
@@ -318,3 +318,8 @@ ENTRY(sys_call_table)
 	.long sys_vmsplice
 	.long sys_move_pages
 	.long sys_getcpu
+	.long sys_kevent_get_events
+	.long sys_kevent_ctl		/* 320 */
+	.long sys_kevent_wait
+	.long sys_kevent_commit
+	.long sys_kevent_init
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index b4aa875..dda2168 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -714,8 +714,13 @@ ia32_sys_call_table:
 	.quad compat_sys_get_robust_list
 	.quad sys_splice
 	.quad sys_sync_file_range
-	.quad sys_tee
+	.quad sys_tee			/* 315 */
 	.quad compat_sys_vmsplice
 	.quad compat_sys_move_pages
 	.quad sys_getcpu
+	.quad sys_kevent_get_events
+	.quad sys_kevent_ctl		/* 320 */
+	.quad sys_kevent_wait
+	.quad sys_kevent_commit
+	.quad sys_kevent_init
 ia32_syscall_end:		
diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
index bd99870..57a6b8c 100644
--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h
@@ -324,10 +324,15 @@
 #define __NR_vmsplice		316
 #define __NR_move_pages		317
 #define __NR_getcpu		318
+#define __NR_kevent_get_events	319
+#define __NR_kevent_ctl		320
+#define __NR_kevent_wait	321
+#define __NR_kevent_commit	322
+#define __NR_kevent_init	323
 
 #ifdef __KERNEL__
 
-#define NR_syscalls 319
+#define NR_syscalls 324
 #include <linux/err.h>
 
 /*
diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h
index 6137146..17d750d 100644
--- a/include/asm-x86_64/unistd.h
+++ b/include/asm-x86_64/unistd.h
@@ -619,10 +619,20 @@ __SYSCALL(__NR_sync_file_range, sys_sync_file_range)
 __SYSCALL(__NR_vmsplice, sys_vmsplice)
 #define __NR_move_pages		279
 __SYSCALL(__NR_move_pages, sys_move_pages)
+#define __NR_kevent_get_events	280
+__SYSCALL(__NR_kevent_get_events, sys_kevent_get_events)
+#define __NR_kevent_ctl		281
+__SYSCALL(__NR_kevent_ctl, sys_kevent_ctl)
+#define __NR_kevent_wait	282
+__SYSCALL(__NR_kevent_wait, sys_kevent_wait)
+#define __NR_kevent_commit	283
+__SYSCALL(__NR_kevent_commit, sys_kevent_commit)
+#define __NR_kevent_init	284
+__SYSCALL(__NR_kevent_init, sys_kevent_init)
 
 #ifdef __KERNEL__
 
-#define __NR_syscall_max __NR_move_pages
+#define __NR_syscall_max __NR_kevent_init
 #include <linux/err.h>
 
 #ifndef __NO_STUBS
diff --git a/include/linux/kevent.h b/include/linux/kevent.h
new file mode 100644
index 0000000..2c31df3
--- /dev/null
+++ b/include/linux/kevent.h
@@ -0,0 +1,244 @@
+/*
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef __KEVENT_H
+#define __KEVENT_H
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/rbtree.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/wait.h>
+#include <linux/net.h>
+#include <linux/rcupdate.h>
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/hrtimer.h>
+#include <linux/kevent_storage.h>
+#include <linux/ukevent.h>
+
+#define KEVENT_MIN_BUFFS_ALLOC	3
+
+struct kevent;
+struct kevent_storage;
+typedef int (* kevent_callback_t)(struct kevent *);
+
+/* @callback is called each time new event has been caught. */
+/* @enqueue is called each time new event is queued. */
+/* @dequeue is called each time event is dequeued. */
+/* @flags - flags for given set of callbacks. */
+
+struct kevent_callbacks {
+	kevent_callback_t	callback, enqueue, dequeue;
+	unsigned int		flags;
+};
+
+#define KEVENT_CALLBACKS_KERNELONLY	0x1
+
+int kevent_event_is_allowed(struct ukevent *e);
+
+#define KEVENT_READY		0x1
+#define KEVENT_STORAGE		0x2
+#define KEVENT_USER		0x4
+
+struct kevent
+{
+	/* Used for kevent freeing.*/
+	struct rcu_head		rcu_head;
+	struct ukevent		event;
+	/* This lock protects ukevent manipulations, e.g. ret_flags changes. */
+	spinlock_t		ulock;
+
+	/* Entry of user's tree. */
+	struct rb_node		kevent_node;
+	/* Entry of origin's queue. */
+	struct list_head	storage_entry;
+	/* Entry of user's ready. */
+	struct list_head	ready_entry;
+
+	u32			flags;
+
+	/* User who requested this kevent. */
+	struct kevent_user	*user;
+	/* Kevent container. */
+	struct kevent_storage	*st;
+
+	struct kevent_callbacks	callbacks;
+
+	/* Private data for different storages.
+	 * poll()/select storage has a list of wait_queue_t containers
+	 * for each ->poll() { poll_wait()' } here.
+	 */
+	void			*priv;
+};
+
+struct kevent_user
+{
+	struct rb_root		kevent_root;
+	spinlock_t		kevent_lock;
+	/* Number of queued kevents. */
+	unsigned int		kevent_num;
+
+	/* List of ready kevents. */
+	struct list_head	ready_list;
+	/* Number of ready kevents. */
+	unsigned int		ready_num;
+	/* Protects all manipulations with ready queue. */
+	spinlock_t 		ready_lock;
+
+	/* Protects against simultaneous kevent_user control manipulations. */
+	struct mutex		ctl_mutex;
+	/* Wait until some events are ready. */
+	wait_queue_head_t	wait;
+	/* Exit from syscall if someone wants us to do it */
+	int			need_exit;
+
+	/* Reference counter, increased for each new kevent. */
+	atomic_t		refcnt;
+
+	/* Mutex protecting userspace ring buffer. */
+	struct mutex		ring_lock;
+	/* Kernel index and size of the userspace ring buffer. */
+	unsigned int		kidx, uidx, ring_size, ring_over, full;
+	/* Pointer to userspace ring buffer. */
+	struct kevent_ring __user *pring;
+
+	/* Is used for absolute waiting times. */
+	struct hrtimer		timer;
+
+#ifdef CONFIG_KEVENT_USER_STAT
+	unsigned long		im_num;
+	unsigned long		wait_num, ring_num;
+	unsigned long		total;
+#endif
+};
+
+int kevent_enqueue(struct kevent *k);
+int kevent_dequeue(struct kevent *k);
+int kevent_init(struct kevent *k);
+void kevent_requeue(struct kevent *k);
+int kevent_break(struct kevent *k);
+
+int kevent_add_callbacks(const struct kevent_callbacks *cb, int pos);
+
+void kevent_storage_ready(struct kevent_storage *st,
+		kevent_callback_t ready_callback, u32 event);
+int kevent_storage_init(void *origin, struct kevent_storage *st);
+void kevent_storage_fini(struct kevent_storage *st);
+int kevent_storage_enqueue(struct kevent_storage *st, struct kevent *k);
+void kevent_storage_dequeue(struct kevent_storage *st, struct kevent *k);
+
+void kevent_ready(struct kevent *k, int ret);
+
+int kevent_user_add_ukevent(struct ukevent *uk, struct kevent_user *u);
+
+#ifdef CONFIG_KEVENT_POLL
+void kevent_poll_reinit(struct file *file);
+#else
+static inline void kevent_poll_reinit(struct file *file)
+{
+}
+#endif
+
+#ifdef CONFIG_KEVENT_USER_STAT
+static inline void kevent_stat_init(struct kevent_user *u)
+{
+	u->wait_num = u->im_num = u->total = u->ring_num = 0;
+}
+static inline void kevent_stat_print(struct kevent_user *u)
+{
+	printk(KERN_INFO "%s: u: %p, wait: %lu, ring: %lu, immediately: %lu, total: %lu.\n",
+			__func__, u, u->wait_num, u->ring_num, u->im_num, u->total);
+}
+static inline void kevent_stat_im(struct kevent_user *u)
+{
+	u->im_num++;
+}
+static inline void kevent_stat_ring(struct kevent_user *u)
+{
+	u->ring_num++;
+}
+static inline void kevent_stat_wait(struct kevent_user *u)
+{
+	u->wait_num++;
+}
+static inline void kevent_stat_total(struct kevent_user *u)
+{
+	u->total++;
+}
+#else
+#define kevent_stat_print(u)		({ (void) u;})
+#define kevent_stat_init(u)		({ (void) u;})
+#define kevent_stat_im(u)		({ (void) u;})
+#define kevent_stat_wait(u)		({ (void) u;})
+#define kevent_stat_ring(u)		({ (void) u;})
+#define kevent_stat_total(u)		({ (void) u;})
+#endif
+
+#ifdef CONFIG_LOCKDEP
+void kevent_socket_reinit(struct socket *sock);
+void kevent_sk_reinit(struct sock *sk);
+#else
+static inline void kevent_socket_reinit(struct socket *sock)
+{
+}
+static inline void kevent_sk_reinit(struct sock *sk)
+{
+}
+#endif
+#ifdef CONFIG_KEVENT_SOCKET
+void kevent_socket_notify(struct sock *sock, u32 event);
+int kevent_socket_dequeue(struct kevent *k);
+int kevent_socket_enqueue(struct kevent *k);
+#define sock_async(__sk) sock_flag(__sk, SOCK_ASYNC)
+#else
+static inline void kevent_socket_notify(struct sock *sock, u32 event)
+{
+}
+#define sock_async(__sk)	({ (void)__sk; 0; })
+#endif
+
+#ifdef CONFIG_KEVENT_POLL
+static inline void kevent_init_file(struct file *file)
+{
+	kevent_storage_init(file, &file->st);
+}
+
+static inline void kevent_cleanup_file(struct file *file)
+{
+	kevent_storage_fini(&file->st);
+}
+#else
+static inline void kevent_init_file(struct file *file) {}
+static inline void kevent_cleanup_file(struct file *file) {}
+#endif
+
+#ifdef CONFIG_KEVENT_PIPE
+extern void kevent_pipe_notify(struct inode *inode, u32 events);
+#else
+static inline void kevent_pipe_notify(struct inode *inode, u32 events) {}
+#endif
+
+#ifdef CONFIG_KEVENT_SIGNAL
+extern int kevent_signal_notify(struct task_struct *tsk, int sig);
+#else
+static inline int kevent_signal_notify(struct task_struct *tsk, int sig) {return 0;}
+#endif
+
+#endif /* __KEVENT_H */
diff --git a/include/linux/kevent_storage.h b/include/linux/kevent_storage.h
new file mode 100644
index 0000000..a38575d
--- /dev/null
+++ b/include/linux/kevent_storage.h
@@ -0,0 +1,11 @@
+#ifndef __KEVENT_STORAGE_H
+#define __KEVENT_STORAGE_H
+
+struct kevent_storage
+{
+	void			*origin;		/* Originator's pointer, e.g. struct sock or struct file. Can be NULL. */
+	struct list_head	list;			/* List of queued kevents. */
+	spinlock_t		lock;			/* Protects users queue. */
+};
+
+#endif /* __KEVENT_STORAGE_H */
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 2d1c3d5..7574ec3 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -54,6 +54,8 @@ struct compat_stat;
 struct compat_timeval;
 struct robust_list_head;
 struct getcpu_cache;
+struct ukevent;
+struct kevent_ring;
 
 #include <linux/types.h>
 #include <linux/aio_abi.h>
@@ -599,4 +601,11 @@ asmlinkage long sys_set_robust_list(struct robust_list_head __user *head,
 				    size_t len);
 asmlinkage long sys_getcpu(unsigned __user *cpu, unsigned __user *node, struct getcpu_cache __user *cache);
 
+asmlinkage long sys_kevent_get_events(int ctl_fd, unsigned int min, unsigned int max,
+		struct timespec timeout, struct ukevent __user *buf, unsigned flags);
+asmlinkage long sys_kevent_ctl(int ctl_fd, unsigned int cmd, unsigned int num, struct ukevent __user *buf);
+asmlinkage long sys_kevent_wait(int ctl_fd, unsigned int num, unsigned int old_uidx, 
+		struct timespec timeout, unsigned int flags);
+asmlinkage long sys_kevent_commit(int ctl_fd, unsigned int new_uidx, unsigned int over);
+asmlinkage long sys_kevent_init(int ctl_fd, struct kevent_ring __user *ring, unsigned int num, unsigned int flags);
 #endif
diff --git a/include/linux/ukevent.h b/include/linux/ukevent.h
new file mode 100644
index 0000000..e59dda7
--- /dev/null
+++ b/include/linux/ukevent.h
@@ -0,0 +1,186 @@
+/*
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef __UKEVENT_H
+#define __UKEVENT_H
+
+#include <linux/types.h>
+
+/*
+ * Kevent request flags.
+ */
+
+/* Process this event only once and then remove it. */
+#define KEVENT_REQ_ONESHOT	0x1
+/* Kevent wakes up only first thread interested in given event,
+ * or all threads if this flag is set.
+ */
+#define KEVENT_REQ_WAKEUP_ALL	0x2
+/* Edge Triggered behaviour. */
+#define KEVENT_REQ_ET		0x4
+/* Perform the last check on kevent (call appropriate callback) when
+ * kevent is marked as ready and has been removed from ready queue.
+ * If it will be confirmed that kevent is ready 
+ * (k->callbacks.callback(k) returns true) then kevent will be copied
+ * to userspace, otherwise it will be requeued back to storage. 
+ * Second (checking) call is performed with this bit _cleared_ so
+ * callback can detect when it was called from 
+ * kevent_storage_ready() - bit is set, or 
+ * kevent_dequeue_ready() - bit is cleared. 
+ * If kevent will be requeued, bit will be set again. */
+#define KEVENT_REQ_LAST_CHECK	0x8
+/*
+ * Always queue kevent even if it is immediately ready.
+ */
+#define KEVENT_REQ_ALWAYS_QUEUE	0x10
+
+/*
+ * Kevent return flags.
+ */
+/* Kevent is broken. */
+#define KEVENT_RET_BROKEN	0x1
+/* Kevent processing was finished successfully. */
+#define KEVENT_RET_DONE		0x2
+/* Kevent was not copied into ring buffer due to some error conditions. */
+#define KEVENT_RET_COPY_FAILED	0x4
+
+/*
+ * Kevent type set.
+ */
+#define KEVENT_SOCKET 		0
+#define KEVENT_INODE		1
+#define KEVENT_TIMER		2
+#define KEVENT_POLL		3
+#define KEVENT_NAIO		4
+#define KEVENT_AIO		5
+#define KEVENT_PIPE		6
+#define KEVENT_SIGNAL		7
+#define KEVENT_POSIX_TIMER	8
+
+/* Used as array size, so must be equal to the last KEVENT type + 1 */
+#define	KEVENT_MAX		9
+
+/*
+ * Per-type event sets.
+ * Number of per-event sets should be exactly as number of kevent types.
+ */
+
+/*
+ * Timer events.
+ */
+#define	KEVENT_TIMER_FIRED	0x1
+
+/*
+ * Socket/network asynchronous IO and PIPE events.
+ */
+#define	KEVENT_SOCKET_RECV	0x1
+#define	KEVENT_SOCKET_ACCEPT	0x2
+#define	KEVENT_SOCKET_SEND	0x4
+
+/*
+ * Inode events.
+ */
+#define	KEVENT_INODE_CREATE	0x1
+#define	KEVENT_INODE_REMOVE	0x2
+
+/*
+ * Poll events.
+ */
+#define	KEVENT_POLL_POLLIN	0x0001
+#define	KEVENT_POLL_POLLPRI	0x0002
+#define	KEVENT_POLL_POLLOUT	0x0004
+#define	KEVENT_POLL_POLLERR	0x0008
+#define	KEVENT_POLL_POLLHUP	0x0010
+#define	KEVENT_POLL_POLLNVAL	0x0020
+
+#define	KEVENT_POLL_POLLRDNORM	0x0040
+#define	KEVENT_POLL_POLLRDBAND	0x0080
+#define	KEVENT_POLL_POLLWRNORM	0x0100
+#define	KEVENT_POLL_POLLWRBAND	0x0200
+#define	KEVENT_POLL_POLLMSG	0x0400
+#define	KEVENT_POLL_POLLREMOVE	0x1000
+#define KEVENT_POLL_POLLRDHUP	0x2000
+
+/*
+ * Asynchronous IO events.
+ */
+#define	KEVENT_AIO_BIO		0x1
+
+/*
+ * Signal events.
+ */
+#define KEVENT_SIGNAL_DELIVERY		0x1
+
+/* If set in raw64, then given signals will not be delivered
+ * in a usual way through sigmask update and signal callback 
+ * invocation. */
+#define KEVENT_SIGNAL_NOMASK	0x8000000000000000ULL
+
+/* Mask of all possible event values. */
+#define KEVENT_MASK_ALL		0xffffffff
+/* Empty mask of ready events. */
+#define KEVENT_MASK_EMPTY	0x0
+
+struct kevent_id
+{
+	union {
+		__u32		raw[2];
+		__u64		raw_u64 __attribute__((aligned(8)));
+	};
+};
+
+struct ukevent
+{
+	/* Id of this request, e.g. socket number, file descriptor and so on... */
+	struct kevent_id	id;
+	/* Event type, e.g. KEVENT_SOCK, KEVENT_INODE, KEVENT_TIMER and so on... */
+	__u32			type;
+	/* Event itself, e.g. SOCK_ACCEPT, INODE_CREATED, TIMER_FIRED... */
+	__u32			event;
+	/* Per-event request flags */
+	__u32			req_flags;
+	/* Per-event return flags */
+	__u32			ret_flags;
+	/* Event return data. Event originator fills it with anything it likes. */
+	__u32			ret_data[2];
+	/* User's data. It is not used, just copied to/from user.
+	 * The whole structure is aligned to 8 bytes already, so the last union
+	 * is aligned properly.
+	 */
+	union {
+		__u32		user[2];
+		void		*ptr;
+	};
+};
+
+struct kevent_ring
+{
+	unsigned int		ring_kidx, ring_over;
+	struct ukevent		event[0];
+};
+
+#define	KEVENT_CTL_ADD 		0
+#define	KEVENT_CTL_REMOVE	1
+#define	KEVENT_CTL_MODIFY	2
+#define	KEVENT_CTL_READY	3
+
+/* Provided timespec parameter uses absolute time, i.e. 'wait until Aug 27, 2194' */
+#define KEVENT_FLAGS_ABSTIME	1
+
+#endif /* __UKEVENT_H */
diff --git a/init/Kconfig b/init/Kconfig
index d2eb7a8..c7d8250 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -201,6 +201,8 @@ config AUDITSYSCALL
 	  such as SELinux.  To use audit's filesystem watch feature, please
 	  ensure that INOTIFY is configured.
 
+source "kernel/kevent/Kconfig"
+
 config IKCONFIG
 	bool "Kernel .config support"
 	---help---
diff --git a/kernel/Makefile b/kernel/Makefile
index d62ec66..2d7a6dd 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -47,6 +47,7 @@ obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
 obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
+obj-$(CONFIG_KEVENT) += kevent/
 obj-$(CONFIG_RELAY) += relay.o
 obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
 obj-$(CONFIG_TASKSTATS) += taskstats.o
diff --git a/kernel/kevent/Kconfig b/kernel/kevent/Kconfig
new file mode 100644
index 0000000..c6cdb0a
--- /dev/null
+++ b/kernel/kevent/Kconfig
@@ -0,0 +1,66 @@
+config KEVENT
+	bool "Kernel event notification mechanism" if EMBEDDED
+	default y
+	help
+	  This option enables event queue mechanism.
+	  It can be used as replacement for poll()/select(), AIO callback
+	  invocations, advanced timer notifications and other kernel
+	  object status changes.
+
+config KEVENT_USER_STAT
+	bool "Kevent user statistic"
+	depends on KEVENT
+	help
+	  This option will turn kevent_user statistic collection on.
+	  Statistic data includes total number of kevent, number of kevents
+	  which are ready immediately at insertion time and number of kevents
+	  which were removed through readiness completion.
+	  It will be printed each time control kevent descriptor is closed.
+
+config KEVENT_TIMER
+	bool "Kernel event notifications for timers"
+	depends on KEVENT
+	default y
+	help
+	  This option allows to use timers through KEVENT subsystem.
+
+config KEVENT_POLL
+	bool "Kernel event notifications for poll()/select()"
+	depends on KEVENT
+	default y
+	help
+	  This option allows to use kevent subsystem for poll()/select()
+	  notifications.
+
+config KEVENT_SOCKET
+	bool "Kernel event notifications for sockets"
+	depends on NET && KEVENT
+	default y
+	help
+	  This option enables notifications through KEVENT subsystem of 
+	  sockets operations, like new packet receiving conditions, 
+	  ready for accept conditions and so on.
+
+config KEVENT_PIPE
+	bool "Kernel event notifications for pipes"
+	depends on KEVENT
+	default y
+	help
+	  This option enables notifications through KEVENT subsystem of 
+	  pipe read/write operations.
+
+config KEVENT_SIGNAL
+	bool "Kernel event notifications for signals"
+	depends on KEVENT
+	default y
+	help
+	  This option enables signal delivery through KEVENT subsystem.
+	  Signals which were requested to be delivered through kevent
+	  subsystem must be registered through usual signal() and others
+	  syscalls, this option allows alternative delivery.
+	  With KEVENT_SIGNAL_NOMASK flag being set in kevent for set of 
+	  signals, they will not be delivered in a usual way.
+	  Kevents for appropriate signals are not copied when process forks,
+	  new process must add new kevents after fork(). Mask of signals
+	  is copied as before.
+
diff --git a/kernel/kevent/Makefile b/kernel/kevent/Makefile
new file mode 100644
index 0000000..f98e0c8
--- /dev/null
+++ b/kernel/kevent/Makefile
@@ -0,0 +1,6 @@
+obj-y := kevent.o kevent_user.o
+obj-$(CONFIG_KEVENT_TIMER) += kevent_timer.o
+obj-$(CONFIG_KEVENT_POLL) += kevent_poll.o
+obj-$(CONFIG_KEVENT_SOCKET) += kevent_socket.o
+obj-$(CONFIG_KEVENT_PIPE) += kevent_pipe.o
+obj-$(CONFIG_KEVENT_SIGNAL) += kevent_signal.o
diff --git a/kernel/kevent/kevent.c b/kernel/kevent/kevent.c
new file mode 100644
index 0000000..1781c57
--- /dev/null
+++ b/kernel/kevent/kevent.c
@@ -0,0 +1,265 @@
+/*
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/mempool.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/kevent.h>
+
+/*
+ * Attempts to add an event into appropriate origin's queue.
+ * Returns positive value if this event is ready immediately,
+ * negative value in case of error and zero if event has been queued.
+ * ->enqueue() callback must increase origin's reference counter.
+ */
+int kevent_enqueue(struct kevent *k)
+{
+	return k->callbacks.enqueue(k);
+}
+
+/*
+ * Remove event from the appropriate queue.
+ * ->dequeue() callback must decrease origin's reference counter.
+ */
+int kevent_dequeue(struct kevent *k)
+{
+	return k->callbacks.dequeue(k);
+}
+
+/*
+ * Mark kevent as broken.
+ */
+int kevent_break(struct kevent *k)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&k->ulock, flags);
+	k->event.ret_flags |= KEVENT_RET_BROKEN;
+	spin_unlock_irqrestore(&k->ulock, flags);
+	return -EINVAL;
+}
+
+static struct kevent_callbacks kevent_registered_callbacks[KEVENT_MAX] __read_mostly;
+
+int kevent_event_is_allowed(struct ukevent *e)
+{
+	if (unlikely(e->type >= KEVENT_MAX))
+		return 0;
+
+	if (!kevent_registered_callbacks[e->type].callback)
+		return 0;
+
+	if (unlikely(kevent_registered_callbacks[e->type].callback == kevent_break))
+		return 0;
+
+	if (kevent_registered_callbacks[e->type].flags & KEVENT_CALLBACKS_KERNELONLY)
+		return 0;
+	
+	return 1;
+}
+
+int kevent_add_callbacks(const struct kevent_callbacks *cb, int pos)
+{
+	struct kevent_callbacks *p;
+
+	if (pos >= KEVENT_MAX)
+		return -EINVAL;
+
+	p = &kevent_registered_callbacks[pos];
+
+	p->enqueue = (cb->enqueue) ? cb->enqueue : kevent_break;
+	p->dequeue = (cb->dequeue) ? cb->dequeue : kevent_break;
+	p->callback = (cb->callback) ? cb->callback : kevent_break;
+	p->flags = cb->flags;
+
+	printk(KERN_INFO "KEVENT: Added callbacks for type %d.\n", pos);
+	return 0;
+}
+
+/*
+ * Must be called before event is going to be added into some origin's queue.
+ * Initializes ->enqueue(), ->dequeue() and ->callback() callbacks.
+ * If failed, kevent should not be used or kevent_enqueue() will fail to add
+ * this kevent into origin's queue with setting
+ * KEVENT_RET_BROKEN flag in kevent->event.ret_flags.
+ */
+int kevent_init(struct kevent *k)
+{
+	spin_lock_init(&k->ulock);
+	k->flags = 0;
+
+	if (unlikely(k->event.type >= KEVENT_MAX)) {
+		kevent_break(k);
+		return -ENOSYS;
+	}
+
+	if (!kevent_registered_callbacks[k->event.type].callback) {
+		kevent_break(k);
+		return -ENOSYS;
+	}
+
+	k->callbacks = kevent_registered_callbacks[k->event.type];
+	if (unlikely(k->callbacks.callback == kevent_break)) {
+		kevent_break(k);
+		return -ENOSYS;
+	}
+
+	return 0;
+}
+
+/*
+ * Called from ->enqueue() callback when reference counter for given
+ * origin (socket, inode...) has been increased.
+ */
+int kevent_storage_enqueue(struct kevent_storage *st, struct kevent *k)
+{
+	unsigned long flags;
+
+	k->st = st;
+	spin_lock_irqsave(&st->lock, flags);
+	list_add_tail_rcu(&k->storage_entry, &st->list);
+	k->flags |= KEVENT_STORAGE;
+	spin_unlock_irqrestore(&st->lock, flags);
+	return 0;
+}
+
+/*
+ * Dequeue kevent from origin's queue.
+ * It does not decrease origin's reference counter in any way
+ * and must be called before it, so storage itself must be valid.
+ * It is called from ->dequeue() callback.
+ */
+void kevent_storage_dequeue(struct kevent_storage *st, struct kevent *k)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&st->lock, flags);
+	if (k->flags & KEVENT_STORAGE) {
+		list_del_rcu(&k->storage_entry);
+		k->flags &= ~KEVENT_STORAGE;
+	}
+	spin_unlock_irqrestore(&st->lock, flags);
+}
+
+void kevent_ready(struct kevent *k, int ret)
+{
+	unsigned long flags;
+	int rem;
+
+	spin_lock_irqsave(&k->ulock, flags);
+	if (ret > 0)
+		k->event.ret_flags |= KEVENT_RET_DONE;
+	else if (ret < 0)
+		k->event.ret_flags |= (KEVENT_RET_BROKEN | KEVENT_RET_DONE);
+	else
+		ret = (k->event.ret_flags & (KEVENT_RET_BROKEN|KEVENT_RET_DONE));
+	rem = (k->event.req_flags & KEVENT_REQ_ONESHOT);
+	spin_unlock_irqrestore(&k->ulock, flags);
+
+	if (ret) {
+		if ((rem || ret < 0) && (k->flags & KEVENT_STORAGE)) {
+			list_del_rcu(&k->storage_entry);
+			k->flags &= ~KEVENT_STORAGE;
+		}
+
+		spin_lock_irqsave(&k->user->ready_lock, flags);
+		if (!(k->flags & KEVENT_READY)) {
+			list_add_tail(&k->ready_entry, &k->user->ready_list);
+			k->flags |= KEVENT_READY;
+			k->user->ready_num++;
+		}
+		spin_unlock_irqrestore(&k->user->ready_lock, flags);
+		wake_up(&k->user->wait);
+	}
+}
+
+/*
+ * Call kevent ready callback and queue it into ready queue if needed.
+ * If kevent is marked as one-shot, then remove it from storage queue.
+ */
+static int __kevent_requeue(struct kevent *k, u32 event)
+{
+	int ret;
+
+	ret = k->callbacks.callback(k);
+
+	kevent_ready(k, ret);
+
+	return ret;
+}
+
+/*
+ * Check if kevent is ready (by invoking it's callback) and requeue/remove
+ * if needed.
+ */
+void kevent_requeue(struct kevent *k)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&k->st->lock, flags);
+	__kevent_requeue(k, 0);
+	spin_unlock_irqrestore(&k->st->lock, flags);
+}
+
+/*
+ * Called each time some activity in origin (socket, inode...) is noticed.
+ */
+void kevent_storage_ready(struct kevent_storage *st,
+		kevent_callback_t ready_callback, u32 event)
+{
+	struct kevent *k;
+	int wake_num = 0;
+
+	rcu_read_lock();
+	if (unlikely(ready_callback))
+		list_for_each_entry_rcu(k, &st->list, storage_entry)
+			(*ready_callback)(k);
+
+	list_for_each_entry_rcu(k, &st->list, storage_entry) {
+		if (event & k->event.event)
+			if ((k->event.req_flags & KEVENT_REQ_WAKEUP_ALL) || wake_num == 0)
+				if (__kevent_requeue(k, event))
+					wake_num++;
+	}
+	rcu_read_unlock();
+}
+
+int kevent_storage_init(void *origin, struct kevent_storage *st)
+{
+	spin_lock_init(&st->lock);
+	st->origin = origin;
+	INIT_LIST_HEAD(&st->list);
+	return 0;
+}
+
+/*
+ * Mark all events as broken, that will remove them from storage,
+ * so storage origin (inode, socket and so on) can be safely removed.
+ * No new entries are allowed to be added into the storage at this point.
+ * (Socket is removed from file table at this point for example).
+ */
+void kevent_storage_fini(struct kevent_storage *st)
+{
+	kevent_storage_ready(st, kevent_break, KEVENT_MASK_ALL);
+}
diff --git a/kernel/kevent/kevent_user.c b/kernel/kevent/kevent_user.c
new file mode 100644
index 0000000..2ed628d
--- /dev/null
+++ b/kernel/kevent/kevent_user.c
@@ -0,0 +1,1363 @@
+/*
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/device.h>
+#include <linux/poll.h>
+#include <linux/kevent.h>
+#include <linux/miscdevice.h>
+#include <asm/io.h>
+
+static kmem_cache_t *kevent_cache __read_mostly;
+static kmem_cache_t *kevent_user_cache __read_mostly;
+
+static int kevent_debug_abstime;
+
+/*
+ * kevents are pollable, return POLLIN and POLLRDNORM
+ * when there is at least one ready kevent.
+ */
+static unsigned int kevent_user_poll(struct file *file, struct poll_table_struct *wait)
+{
+	struct kevent_user *u = file->private_data;
+	unsigned int mask;
+
+	poll_wait(file, &u->wait, wait);
+	mask = 0;
+
+	if (u->ready_num || u->need_exit)
+		mask |= POLLIN | POLLRDNORM;
+	u->need_exit = 0;
+
+	return mask;
+}
+
+static inline unsigned int kevent_ring_space(struct kevent_user *u)
+{
+	if (!u->pring)
+		return 1;
+
+	if (u->full)
+		return 0;
+
+	return (u->uidx > u->kidx)?
+		(u->uidx - u->kidx):
+		(u->ring_size - (u->kidx - u->uidx));
+}
+
+static inline int kevent_ring_index_inc(unsigned int *pidx, unsigned int size)
+{
+	unsigned int idx = *pidx;
+
+	if (++idx >= size)
+		idx = 0;
+	*pidx = idx;
+	return (idx == 0);
+}
+
+/*
+ * Copies kevent into userspace ring buffer if it was initialized.
+ * Returns 
+ *  0 on success or if ring buffer is not used
+ *  -EAGAIN if there were no place for that kevent
+ *  -EFAULT if copy_to_user() failed.
+ *
+ *  Must be called under kevent_user->ring_lock locked.
+ */
+static int kevent_copy_ring_buffer(struct kevent *k)
+{
+	struct kevent_ring __user *ring;
+	struct kevent_user *u = k->user;
+	unsigned long flags;
+	int err;
+
+	ring = u->pring;
+	if (!ring)
+		return 0;
+
+	if (!kevent_ring_space(u))
+		return -EAGAIN;
+
+	if (copy_to_user(&ring->event[u->kidx], &k->event, sizeof(struct ukevent))) {
+		err = -EFAULT;
+		goto err_out_exit;
+	}
+
+	kevent_ring_index_inc(&u->kidx, u->ring_size);
+
+	if (u->kidx == u->uidx)
+		u->full = 1;
+
+	if (put_user(u->kidx, &ring->ring_kidx)) {
+		err = -EFAULT;
+		goto err_out_exit;
+	}
+
+	return 0;
+
+err_out_exit:
+	spin_lock_irqsave(&k->ulock, flags);
+	k->event.ret_flags |= KEVENT_RET_COPY_FAILED;
+	spin_unlock_irqrestore(&k->ulock, flags);
+	return err;
+}
+
+static struct kevent_user *kevent_user_alloc(struct kevent_ring __user *ring, unsigned int num)
+{
+	struct kevent_user *u;
+
+	u = kmem_cache_alloc(kevent_user_cache, GFP_KERNEL);
+	if (!u)
+		return NULL;
+
+	INIT_LIST_HEAD(&u->ready_list);
+	spin_lock_init(&u->ready_lock);
+	kevent_stat_init(u);
+	spin_lock_init(&u->kevent_lock);
+	u->kevent_root = RB_ROOT;
+
+	mutex_init(&u->ctl_mutex);
+	init_waitqueue_head(&u->wait);
+	u->need_exit = 0;
+
+	atomic_set(&u->refcnt, 1);
+
+	mutex_init(&u->ring_lock);
+	u->kidx = u->uidx = u->ring_over = u->full = 0;
+
+	u->pring = ring;
+	u->ring_size = num;
+
+	hrtimer_init(&u->timer, CLOCK_REALTIME, HRTIMER_ABS);
+
+	return u;
+}
+
+/*
+ * Kevent userspace control block reference counting.
+ * Set to 1 at creation time, when appropriate kevent file descriptor
+ * is closed, that reference counter is decreased.
+ * When counter hits zero block is freed.
+ */
+static inline void kevent_user_get(struct kevent_user *u)
+{
+	atomic_inc(&u->refcnt);
+}
+
+static inline void kevent_user_put(struct kevent_user *u)
+{
+	if (atomic_dec_and_test(&u->refcnt)) {
+		kevent_stat_print(u);
+		hrtimer_cancel(&u->timer);
+		kmem_cache_free(kevent_user_cache, u);
+	}
+}
+
+static inline int kevent_compare_id(struct kevent_id *left, struct kevent_id *right)
+{
+	if (left->raw_u64 > right->raw_u64)
+		return -1;
+
+	if (right->raw_u64 > left->raw_u64)
+		return 1;
+
+	return 0;
+}
+
+/*
+ * RCU protects storage list (kevent->storage_entry).
+ * Free entry in RCU callback, it is dequeued from all lists at
+ * this point.
+ */
+
+static void kevent_free_rcu(struct rcu_head *rcu)
+{
+	struct kevent *kevent = container_of(rcu, struct kevent, rcu_head);
+	kmem_cache_free(kevent_cache, kevent);
+}
+
+/*
+ * Must be called under u->ready_lock.
+ * This function unlinks kevent from ready queue.
+ */
+static inline void kevent_unlink_ready(struct kevent *k)
+{
+	list_del(&k->ready_entry);
+	k->flags &= ~KEVENT_READY;
+	k->user->ready_num--;
+}
+
+static void kevent_remove_ready(struct kevent *k)
+{
+	struct kevent_user *u = k->user;
+	unsigned long flags;
+
+	spin_lock_irqsave(&u->ready_lock, flags);
+	if (k->flags & KEVENT_READY)
+		kevent_unlink_ready(k);
+	spin_unlock_irqrestore(&u->ready_lock, flags);
+}
+
+/*
+ * Complete kevent removing - it dequeues kevent from storage list
+ * if it is requested, removes kevent from ready list, drops userspace
+ * control block reference counter and schedules kevent freeing through RCU.
+ */
+static void kevent_finish_user_complete(struct kevent *k, int deq)
+{
+	if (deq)
+		kevent_dequeue(k);
+
+	kevent_remove_ready(k);
+
+	kevent_user_put(k->user);
+	call_rcu(&k->rcu_head, kevent_free_rcu);
+}
+
+/*
+ * Remove from all lists and free kevent.
+ * Must be called under kevent_user->kevent_lock to protect
+ * kevent->kevent_entry removing.
+ */
+static void __kevent_finish_user(struct kevent *k, int deq)
+{
+	struct kevent_user *u = k->user;
+
+	rb_erase(&k->kevent_node, &u->kevent_root);
+	k->flags &= ~KEVENT_USER;
+	u->kevent_num--;
+	kevent_finish_user_complete(k, deq);
+}
+
+/*
+ * Remove kevent from user's list of all events,
+ * dequeue it from storage and decrease user's reference counter,
+ * since this kevent does not exist anymore. That is why it is freed here.
+ */
+static void kevent_finish_user(struct kevent *k, int deq)
+{
+	struct kevent_user *u = k->user;
+	unsigned long flags;
+
+	spin_lock_irqsave(&u->kevent_lock, flags);
+	rb_erase(&k->kevent_node, &u->kevent_root);
+	k->flags &= ~KEVENT_USER;
+	u->kevent_num--;
+	spin_unlock_irqrestore(&u->kevent_lock, flags);
+	kevent_finish_user_complete(k, deq);
+}
+
+static struct kevent *__kevent_dequeue_ready_one(struct kevent_user *u)
+{
+	unsigned long flags;
+	struct kevent *k = NULL;
+
+	if (u->ready_num) {
+		spin_lock_irqsave(&u->ready_lock, flags);
+		if (u->ready_num && !list_empty(&u->ready_list)) {
+			k = list_entry(u->ready_list.next, struct kevent, ready_entry);
+			kevent_unlink_ready(k);
+		}
+		spin_unlock_irqrestore(&u->ready_lock, flags);
+	}
+
+	return k;
+}
+
+static struct kevent *kevent_dequeue_ready_one(struct kevent_user *u)
+{
+	struct kevent *k = NULL;
+
+	while (u->ready_num && !k) {
+		k = __kevent_dequeue_ready_one(u);
+
+		if (k && (k->event.req_flags & KEVENT_REQ_LAST_CHECK)) {
+			unsigned long flags;
+
+			spin_lock_irqsave(&k->ulock, flags);
+			k->event.req_flags &= ~KEVENT_REQ_LAST_CHECK;
+			spin_unlock_irqrestore(&k->ulock, flags);
+
+			if (!k->callbacks.callback(k)) {
+				spin_lock_irqsave(&k->ulock, flags);
+				k->event.req_flags |= KEVENT_REQ_LAST_CHECK;
+				k->event.ret_flags = 0;
+				k->event.ret_data[0] = k->event.ret_data[1] = 0;
+				spin_unlock_irqrestore(&k->ulock, flags);
+				k = NULL;
+			}
+		} else
+			break;
+	}
+
+	return k;
+}
+
+static inline void kevent_copy_ring(struct kevent *k)
+{
+	unsigned long flags;
+
+	if (!k)
+		return;
+
+	if (kevent_copy_ring_buffer(k)) {
+		spin_lock_irqsave(&k->ulock, flags);
+		k->event.ret_flags |= KEVENT_RET_COPY_FAILED;
+		spin_unlock_irqrestore(&k->ulock, flags);
+	}
+}
+
+/*
+ * Dequeue one entry from user's ready queue.
+ */
+static struct kevent *kevent_dequeue_ready(struct kevent_user *u)
+{
+	struct kevent *k;
+
+	mutex_lock(&u->ring_lock);
+	k = kevent_dequeue_ready_one(u);
+	kevent_copy_ring(k);
+	mutex_unlock(&u->ring_lock);
+
+	return k;
+}
+
+/*
+ * Dequeue one entry from user's ready queue if there is space in ring buffer.
+ */
+static struct kevent *kevent_dequeue_ready_ring(struct kevent_user *u)
+{
+	struct kevent *k = NULL;
+
+	mutex_lock(&u->ring_lock);
+	if (kevent_ring_space(u)) {
+		k = kevent_dequeue_ready_one(u);
+		kevent_copy_ring(k);
+	}
+	mutex_unlock(&u->ring_lock);
+
+	return k;
+}
+
+static void kevent_complete_ready(struct kevent *k)
+{
+	if (k->event.req_flags & KEVENT_REQ_ONESHOT)
+		/*
+		 * If it is one-shot kevent, it has been removed already from
+		 * origin's queue, so we can easily free it here.
+		 */
+		kevent_finish_user(k, 1);
+	else if (k->event.req_flags & KEVENT_REQ_ET) {
+		unsigned long flags;
+
+		/*
+		 * Edge-triggered behaviour: mark event as clear new one.
+		 */
+
+		spin_lock_irqsave(&k->ulock, flags);
+		k->event.ret_flags = 0;
+		k->event.ret_data[0] = k->event.ret_data[1] = 0;
+		spin_unlock_irqrestore(&k->ulock, flags);
+	}
+}
+
+/*
+ * Search a kevent inside kevent tree for given ukevent.
+ */
+static struct kevent *__kevent_search(struct kevent_id *id, struct kevent_user *u)
+{
+	struct kevent *k, *ret = NULL;
+	struct rb_node *n = u->kevent_root.rb_node;
+	int cmp;
+
+	while (n) {
+		k = rb_entry(n, struct kevent, kevent_node);
+		cmp = kevent_compare_id(&k->event.id, id);
+
+		if (cmp > 0)
+			n = n->rb_right;
+		else if (cmp < 0)
+			n = n->rb_left;
+		else {
+			ret = k;
+			break;
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * Search and modify kevent according to provided ukevent.
+ */
+static int kevent_modify(struct ukevent *uk, struct kevent_user *u)
+{
+	struct kevent *k;
+	int err = -ENODEV;
+	unsigned long flags;
+
+	spin_lock_irqsave(&u->kevent_lock, flags);
+	k = __kevent_search(&uk->id, u);
+	if (k) {
+		spin_lock(&k->ulock);
+		k->event.event = uk->event;
+		k->event.req_flags = uk->req_flags;
+		k->event.ret_flags = 0;
+		spin_unlock(&k->ulock);
+		kevent_requeue(k);
+		err = 0;
+	}
+	spin_unlock_irqrestore(&u->kevent_lock, flags);
+
+	return err;
+}
+
+/*
+ * Remove kevent which matches provided ukevent.
+ */
+static int kevent_remove(struct ukevent *uk, struct kevent_user *u)
+{
+	int err = -ENODEV;
+	struct kevent *k;
+	unsigned long flags;
+
+	spin_lock_irqsave(&u->kevent_lock, flags);
+	k = __kevent_search(&uk->id, u);
+	if (k) {
+		__kevent_finish_user(k, 1);
+		err = 0;
+	}
+	spin_unlock_irqrestore(&u->kevent_lock, flags);
+
+	return err;
+}
+
+/*
+ * Detaches userspace control block from file descriptor
+ * and decrease it's reference counter.
+ * No new kevents can be added or removed from any list at this point.
+ */
+static int kevent_user_release(struct inode *inode, struct file *file)
+{
+	struct kevent_user *u = file->private_data;
+	struct kevent *k;
+	struct rb_node *n;
+
+	for (n = rb_first(&u->kevent_root); n; n = rb_next(n)) {
+		k = rb_entry(n, struct kevent, kevent_node);
+		kevent_finish_user(k, 1);
+	}
+
+	kevent_user_put(u);
+	file->private_data = NULL;
+
+	return 0;
+}
+
+/*
+ * Read requested number of ukevents in one shot.
+ */
+static struct ukevent *kevent_get_user(unsigned int num, void __user *arg)
+{
+	struct ukevent *ukev;
+
+	ukev = kmalloc(sizeof(struct ukevent) * num, GFP_KERNEL);
+	if (!ukev)
+		return NULL;
+
+	if (copy_from_user(ukev, arg, sizeof(struct ukevent) * num)) {
+		kfree(ukev);
+		return NULL;
+	}
+
+	return ukev;
+}
+
+static int kevent_mark_ready(struct ukevent *uk, struct kevent_user *u)
+{
+	struct kevent *k;
+	int err = -ENODEV;
+	unsigned long flags;
+
+	spin_lock_irqsave(&u->kevent_lock, flags);
+	k = __kevent_search(&uk->id, u);
+	if (k) {
+		spin_lock(&k->st->lock);
+		kevent_ready(k, 1);
+		spin_unlock(&k->st->lock);
+		err = 0;
+	}
+	spin_unlock_irqrestore(&u->kevent_lock, flags);
+
+	return err;
+}
+
+/*
+ * Mark appropriate kevents as ready.
+ * If number of events is zero just wake up one listener.
+ */
+static int kevent_user_ctl_ready(struct kevent_user *u, unsigned int num, void __user *arg)
+{
+	int err = -EINVAL, cerr = 0, rnum = 0, i;
+	void __user *orig = arg;
+	struct ukevent uk;
+
+	if (num > u->kevent_num)
+		return err;
+	
+	if (!num) {
+		u->need_exit = 1;
+		wake_up(&u->wait);
+		return 0;
+	}
+
+	mutex_lock(&u->ctl_mutex);
+
+	if (num > KEVENT_MIN_BUFFS_ALLOC) {
+		struct ukevent *ukev;
+
+		ukev = kevent_get_user(num, arg);
+		if (ukev) {
+			for (i = 0; i < num; ++i) {
+				err = kevent_mark_ready(&ukev[i], u);
+				if (err) {
+					if (i != rnum)
+						memcpy(&ukev[rnum], &ukev[i], sizeof(struct ukevent));
+					rnum++;
+				}
+			}
+			if (copy_to_user(orig, ukev, rnum*sizeof(struct ukevent)))
+				cerr = -EFAULT;
+			kfree(ukev);
+			goto out_setup;
+		}
+	}
+
+	for (i = 0; i < num; ++i) {
+		if (copy_from_user(&uk, arg, sizeof(struct ukevent))) {
+			cerr = -EFAULT;
+			break;
+		}
+		arg += sizeof(struct ukevent);
+
+		err = kevent_mark_ready(&uk, u);
+		if (err) {
+			if (copy_to_user(orig, &uk, sizeof(struct ukevent))) {
+				cerr = -EFAULT;
+				break;
+			}
+			orig += sizeof(struct ukevent);
+			rnum++;
+		}
+	}
+
+out_setup:
+	if (cerr < 0) {
+		err = cerr;
+		goto out_remove;
+	}
+
+	err = num - rnum;
+out_remove:
+	mutex_unlock(&u->ctl_mutex);
+
+	return err;
+}
+
+/*
+ * Read from userspace all ukevents and modify appropriate kevents.
+ * If provided number of ukevents is more that threshold, it is faster
+ * to allocate a room for them and copy in one shot instead of copy
+ * one-by-one and then process them.
+ */
+static int kevent_user_ctl_modify(struct kevent_user *u, unsigned int num, void __user *arg)
+{
+	int err = 0, i;
+	struct ukevent uk;
+
+	mutex_lock(&u->ctl_mutex);
+
+	if (num > u->kevent_num) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	if (num > KEVENT_MIN_BUFFS_ALLOC) {
+		struct ukevent *ukev;
+
+		ukev = kevent_get_user(num, arg);
+		if (ukev) {
+			for (i = 0; i < num; ++i) {
+				if (kevent_modify(&ukev[i], u))
+					ukev[i].ret_flags |= KEVENT_RET_BROKEN;
+				ukev[i].ret_flags |= KEVENT_RET_DONE;
+			}
+			if (copy_to_user(arg, ukev, num*sizeof(struct ukevent)))
+				err = -EFAULT;
+			kfree(ukev);
+			goto out;
+		}
+	}
+
+	for (i = 0; i < num; ++i) {
+		if (copy_from_user(&uk, arg, sizeof(struct ukevent))) {
+			err = -EFAULT;
+			break;
+		}
+
+		if (kevent_modify(&uk, u))
+			uk.ret_flags |= KEVENT_RET_BROKEN;
+		uk.ret_flags |= KEVENT_RET_DONE;
+
+		if (copy_to_user(arg, &uk, sizeof(struct ukevent))) {
+			err = -EFAULT;
+			break;
+		}
+
+		arg += sizeof(struct ukevent);
+	}
+out:
+	mutex_unlock(&u->ctl_mutex);
+
+	return err;
+}
+
+/*
+ * Read from userspace all ukevents and remove appropriate kevents.
+ * If provided number of ukevents is more that threshold, it is faster
+ * to allocate a room for them and copy in one shot instead of copy
+ * one-by-one and then process them.
+ */
+static int kevent_user_ctl_remove(struct kevent_user *u, unsigned int num, void __user *arg)
+{
+	int err = 0, i;
+	struct ukevent uk;
+
+	mutex_lock(&u->ctl_mutex);
+
+	if (num > u->kevent_num) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	if (num > KEVENT_MIN_BUFFS_ALLOC) {
+		struct ukevent *ukev;
+
+		ukev = kevent_get_user(num, arg);
+		if (ukev) {
+			for (i = 0; i < num; ++i) {
+				if (kevent_remove(&ukev[i], u))
+					ukev[i].ret_flags |= KEVENT_RET_BROKEN;
+				ukev[i].ret_flags |= KEVENT_RET_DONE;
+			}
+			if (copy_to_user(arg, ukev, num*sizeof(struct ukevent)))
+				err = -EFAULT;
+			kfree(ukev);
+			goto out;
+		}
+	}
+
+	for (i = 0; i < num; ++i) {
+		if (copy_from_user(&uk, arg, sizeof(struct ukevent))) {
+			err = -EFAULT;
+			break;
+		}
+
+		if (kevent_remove(&uk, u))
+			uk.ret_flags |= KEVENT_RET_BROKEN;
+
+		uk.ret_flags |= KEVENT_RET_DONE;
+
+		if (copy_to_user(arg, &uk, sizeof(struct ukevent))) {
+			err = -EFAULT;
+			break;
+		}
+
+		arg += sizeof(struct ukevent);
+	}
+out:
+	mutex_unlock(&u->ctl_mutex);
+
+	return err;
+}
+
+/*
+ * Queue kevent into userspace control block and increase
+ * it's reference counter.
+ */
+static int kevent_user_enqueue(struct kevent_user *u, struct kevent *new)
+{
+	unsigned long flags;
+	struct rb_node **p = &u->kevent_root.rb_node, *parent = NULL;
+	struct kevent *k;
+	int err = 0, cmp;
+
+	spin_lock_irqsave(&u->kevent_lock, flags);
+	while (*p) {
+		parent = *p;
+		k = rb_entry(parent, struct kevent, kevent_node);
+
+		cmp = kevent_compare_id(&k->event.id, &new->event.id);
+		if (cmp > 0)
+			p = &parent->rb_right;
+		else if (cmp < 0)
+			p = &parent->rb_left;
+		else {
+			err = -EEXIST;
+			break;
+		}
+	}
+	if (likely(!err)) {
+		rb_link_node(&new->kevent_node, parent, p);
+		rb_insert_color(&new->kevent_node, &u->kevent_root);
+		new->flags |= KEVENT_USER;
+		u->kevent_num++;
+		kevent_user_get(u);
+	}
+	spin_unlock_irqrestore(&u->kevent_lock, flags);
+
+	return err;
+}
+
+/*
+ * Add kevent from both kernel and userspace users.
+ * This function allocates and queues kevent, returns negative value
+ * on error, positive if kevent is ready immediately and zero
+ * if kevent has been queued.
+ */
+int kevent_user_add_ukevent(struct ukevent *uk, struct kevent_user *u)
+{
+	struct kevent *k;
+	int err;
+
+	k = kmem_cache_alloc(kevent_cache, GFP_KERNEL);
+	if (!k) {
+		err = -ENOMEM;
+		goto err_out_exit;
+	}
+
+	memcpy(&k->event, uk, sizeof(struct ukevent));
+	INIT_RCU_HEAD(&k->rcu_head);
+
+	k->event.ret_flags = 0;
+
+	err = kevent_init(k);
+	if (err) {
+		kmem_cache_free(kevent_cache, k);
+		goto err_out_exit;
+	}
+	k->user = u;
+	kevent_stat_total(u);
+	err = kevent_user_enqueue(u, k);
+	if (err) {
+		kmem_cache_free(kevent_cache, k);
+		goto err_out_exit;
+	}
+
+	err = kevent_enqueue(k);
+	if (err) {
+		memcpy(uk, &k->event, sizeof(struct ukevent));
+		kevent_finish_user(k, 0);
+		goto err_out_exit;
+	}
+
+	return 0;
+
+err_out_exit:
+	if (err < 0) {
+		uk->ret_flags |= KEVENT_RET_BROKEN | KEVENT_RET_DONE;
+		uk->ret_data[1] = err;
+	} else if (err > 0)
+		uk->ret_flags |= KEVENT_RET_DONE;
+	return err;
+}
+
+
+/*
+ * Copy all ukevents from userspace, allocate kevent for each one
+ * and add them into appropriate kevent_storages,
+ * e.g. sockets, inodes and so on...
+ * Ready events will replace ones provided by used and number
+ * of ready events is returned.
+ * User must check ret_flags field of each ukevent structure
+ * to determine if it is fired or failed event.
+ */
+static int kevent_user_ctl_add(struct kevent_user *u, unsigned int num, void __user *arg)
+{
+	int err, cerr = 0, rnum = 0, i;
+	void __user *orig = arg;
+	struct ukevent uk;
+
+	mutex_lock(&u->ctl_mutex);
+
+	err = -EINVAL;
+	if (num > KEVENT_MIN_BUFFS_ALLOC) {
+		struct ukevent *ukev;
+
+		ukev = kevent_get_user(num, arg);
+		if (ukev) {
+			for (i = 0; i < num; ++i) {
+				if (!kevent_event_is_allowed(&ukev[i])) {
+					if (i != rnum)
+						memcpy(&ukev[rnum], 
+							&ukev[i], 
+							sizeof(struct ukevent));
+					rnum++;
+				} else {
+					err = kevent_user_add_ukevent(&ukev[i], u);
+					if (err) {
+						kevent_stat_im(u);
+						if (i != rnum)
+							memcpy(&ukev[rnum], 
+								&ukev[i], 
+								sizeof(struct ukevent));
+						rnum++;
+					}
+				}
+			}
+			if (copy_to_user(orig, ukev, rnum*sizeof(struct ukevent)))
+				cerr = -EFAULT;
+			kfree(ukev);
+			goto out_setup;
+		}
+	}
+
+	for (i = 0; i < num; ++i) {
+		if (copy_from_user(&uk, arg, sizeof(struct ukevent))) {
+			cerr = -EFAULT;
+			break;
+		}
+		arg += sizeof(struct ukevent);
+
+		if (!kevent_event_is_allowed(&uk)) {
+			err = 1;
+		} else {
+			err = kevent_user_add_ukevent(&uk, u);
+			if (err)
+				kevent_stat_im(u);
+		}
+		if (err) {
+			if (copy_to_user(orig, &uk, sizeof(struct ukevent))) {
+				cerr = -EFAULT;
+				break;
+			}
+			orig += sizeof(struct ukevent);
+			rnum++;
+		}
+	}
+
+out_setup:
+	if (cerr < 0) {
+		err = cerr;
+		goto out_remove;
+	}
+
+	err = rnum;
+out_remove:
+	mutex_unlock(&u->ctl_mutex);
+
+	return err;
+}
+
+/* Used to wakeup waiting syscalls in case high-resolution timer is used. */
+static int kevent_user_wake(struct hrtimer *timer)
+{
+	struct kevent_user *u = container_of(timer, struct kevent_user, timer);
+
+	u->need_exit = 1;
+	wake_up(&u->wait);
+
+	return HRTIMER_NORESTART;
+}
+
+
+/*
+ * In nonblocking mode it returns as many events as possible, but not more than @max_nr.
+ * In blocking mode it waits until timeout or if at least @min_nr events are ready.
+ */
+static int kevent_user_wait(struct file *file, struct kevent_user *u,
+		unsigned int min_nr, unsigned int max_nr, struct timespec timeout,
+		void __user *buf, unsigned int flags)
+{
+	struct kevent *k;
+	int num = 0;
+	long tm = MAX_SCHEDULE_TIMEOUT;
+
+	if (!(file->f_flags & O_NONBLOCK)) {
+		if (!timespec_valid(&timeout))
+			return -EINVAL;
+
+		if (flags & KEVENT_FLAGS_ABSTIME) {
+			hrtimer_cancel(&u->timer);
+			hrtimer_init(&u->timer, CLOCK_REALTIME, HRTIMER_ABS);
+			u->timer.expires = ktime_set(timeout.tv_sec, timeout.tv_nsec);
+			u->timer.function = &kevent_user_wake;
+			hrtimer_start(&u->timer, u->timer.expires, HRTIMER_ABS);
+			if (unlikely(kevent_debug_abstime == 0)) {
+				printk(KERN_INFO "kevent: author was wrong, "
+						"someone uses absolute time in %s, "
+						"please report to remove this warning.\n", __func__);
+				kevent_debug_abstime = 1;
+			}
+		} else {
+			tm = timespec_to_jiffies(&timeout);
+		}
+
+		wait_event_interruptible_timeout(u->wait,
+			((u->ready_num >= 1) && kevent_ring_space(u)) || u->need_exit, tm);
+	}
+	u->need_exit = 0;
+
+	while (num < max_nr && ((k = kevent_dequeue_ready(u)) != NULL)) {
+		if (copy_to_user(buf + num*sizeof(struct ukevent),
+					&k->event, sizeof(struct ukevent))) {
+			if (num == 0)
+				num = -EFAULT;
+			break;
+		}
+		kevent_complete_ready(k);
+		++num;
+		kevent_stat_wait(u);
+	}
+
+	return num;
+}
+
+struct file_operations kevent_user_fops = {
+	.release	= kevent_user_release,
+	.poll		= kevent_user_poll,
+	.owner		= THIS_MODULE,
+};
+
+static int kevent_ctl_process(struct file *file, unsigned int cmd, unsigned int num, void __user *arg)
+{
+	int err;
+	struct kevent_user *u = file->private_data;
+
+	switch (cmd) {
+	case KEVENT_CTL_ADD:
+		err = kevent_user_ctl_add(u, num, arg);
+		break;
+	case KEVENT_CTL_REMOVE:
+		err = kevent_user_ctl_remove(u, num, arg);
+		break;
+	case KEVENT_CTL_MODIFY:
+		err = kevent_user_ctl_modify(u, num, arg);
+		break;
+	case KEVENT_CTL_READY:
+		err = kevent_user_ctl_ready(u, num, arg);
+		break;
+	default:
+		err = -EINVAL;
+		break;
+	}
+
+	return err;
+}
+
+/*
+ * Used to get ready kevents from queue.
+ * @ctl_fd - kevent control descriptor which must be obtained through kevent_ctl(KEVENT_CTL_INIT).
+ * @min_nr - minimum number of ready kevents.
+ * @max_nr - maximum number of ready kevents.
+ * @timeout - time to wait until some events are ready.
+ * @buf - buffer to place ready events.
+ * @flags - various flags (see include/linux/ukevent.h KEVENT_FLAGS_*).
+ */
+asmlinkage long sys_kevent_get_events(int ctl_fd, unsigned int min_nr, unsigned int max_nr,
+		struct timespec timeout, struct ukevent __user *buf, unsigned flags)
+{
+	int err = -EINVAL;
+	struct file *file;
+	struct kevent_user *u;
+
+	file = fget(ctl_fd);
+	if (!file)
+		return -EBADF;
+
+	if (file->f_op != &kevent_user_fops)
+		goto out_fput;
+	u = file->private_data;
+
+	err = kevent_user_wait(file, u, min_nr, max_nr, timeout, buf, flags);
+out_fput:
+	fput(file);
+	return err;
+}
+
+static struct vfsmount *kevent_mnt __read_mostly;
+
+static int kevent_get_sb(struct file_system_type *fs_type, int flags,
+		   const char *dev_name, void *data, struct vfsmount *mnt)
+{
+	return get_sb_pseudo(fs_type, "kevent", NULL, 0xaabbccdd, mnt);
+}
+
+static struct file_system_type kevent_fs_type = {
+	.name		= "keventfs",
+	.get_sb		= kevent_get_sb,
+	.kill_sb	= kill_anon_super,
+};
+
+static int keventfs_delete_dentry(struct dentry *dentry)
+{
+	return 1;
+}
+
+static struct dentry_operations keventfs_dentry_operations = {
+	.d_delete	= keventfs_delete_dentry,
+};
+
+asmlinkage long sys_kevent_init(struct kevent_ring __user *ring, unsigned int num, unsigned int flags)
+{
+	struct qstr this;
+	char name[32];
+	struct dentry *dentry;
+	struct inode *inode;
+	struct file *file;
+	int err = -ENFILE, fd;
+	struct kevent_user *u;
+
+	if ((ring && !num) || (!ring && num) || (num == 1))
+		return -EINVAL;
+
+	file = get_empty_filp();
+	if (!file)
+		goto err_out_exit;
+
+	inode = new_inode(kevent_mnt->mnt_sb);
+	if (!inode)
+		goto err_out_fput;
+
+	inode->i_fop = &kevent_user_fops;
+
+	inode->i_state = I_DIRTY;
+	inode->i_mode = S_IRUSR | S_IWUSR;
+	inode->i_uid = current->fsuid;
+	inode->i_gid = current->fsgid;
+	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+
+	err = get_unused_fd();
+	if (err < 0)
+		goto err_out_iput;
+	fd = err;
+
+	err = -ENOMEM;
+	u = kevent_user_alloc(ring, num);
+	if (!u)
+		goto err_out_put_fd;
+
+	sprintf(name, "[%lu]", inode->i_ino);
+	this.name = name;
+	this.len = strlen(name);
+	this.hash = inode->i_ino;
+	dentry = d_alloc(kevent_mnt->mnt_sb->s_root, &this);
+	if (!dentry)
+		goto err_out_free;
+	dentry->d_op = &keventfs_dentry_operations;
+	d_add(dentry, inode);
+	file->f_vfsmnt = mntget(kevent_mnt);
+	file->f_dentry = dentry;
+	file->f_mapping = inode->i_mapping;
+	file->f_pos = 0;
+	file->f_flags = O_RDONLY;
+	file->f_op = &kevent_user_fops;
+	file->f_mode = FMODE_READ;
+	file->f_version = 0;
+	file->private_data = u;
+
+	fd_install(fd, file);
+
+	return fd;
+
+err_out_free:
+	kmem_cache_free(kevent_user_cache, u);
+err_out_put_fd:
+	put_unused_fd(fd);
+err_out_iput:
+	iput(inode);
+err_out_fput:
+	put_filp(file);
+err_out_exit:
+	return err;
+}
+
+/*
+ * Commits user's index (consumer index).
+ * Must be called under u->ring_lock mutex held.
+ */
+static int __kevent_user_commit(struct kevent_user *u, unsigned int new_uidx, unsigned int over)
+{
+	int err = -EOVERFLOW, comm = 0;
+	struct kevent_ring __user *ring = u->pring;
+
+	if (!ring) {
+		err = 0;
+		goto err_out_exit;
+	}
+
+	if (new_uidx >= u->ring_size) {
+		err = -EINVAL;
+		goto err_out_exit;
+	}
+
+	if ((over != u->ring_over - 1) && (over != u->ring_over))
+		goto err_out_exit;
+
+	if (u->uidx < u->kidx && new_uidx > u->kidx) {
+		err = -EINVAL;
+		goto err_out_exit;
+	}
+
+	if (new_uidx > u->uidx) {
+		if (over != u->ring_over)
+			goto err_out_exit;
+
+		comm = new_uidx - u->uidx;
+		u->uidx = new_uidx;
+		u->full = 0;
+	} else if (new_uidx < u->uidx) {
+		comm = u->ring_size - (u->uidx - new_uidx);
+		u->uidx = new_uidx;
+		u->full = 0;
+		u->ring_over++;
+
+		if (put_user(u->ring_over, &ring->ring_over)) {
+			err = -EFAULT;
+			goto err_out_exit;
+		}
+	}
+
+	return comm;
+
+err_out_exit:
+	return err;
+}
+
+/*
+ * This syscall is used to perform waiting until there is free space in the ring
+ * buffer, in that case some events will be copied there.
+ * Function returns number of actually copied ready events in ring buffer.
+ * After this function is completed userspace ring->ring_kidx will be updated.
+ *
+ * @ctl_fd - kevent file descriptor.
+ * @num - number of kevents to process.
+ * @old_uidx - the last index user is aware of.
+ * @timeout - time to wait until there is free space in kevent queue.
+ * @flags - various flags (see include/linux/ukevent.h KEVENT_FLAGS_*).
+ *
+ * When we need to commit @num events, it means we should just remove first @num
+ * kevents from ready queue and copy them into the buffer. 
+ * Kevents will be copied into ring buffer in order they were placed into ready queue.
+ * One-shot kevents will be removed here, since there is no way they can be reused.
+ * Edge-triggered events will be requeued here for better performance.
+ */
+asmlinkage long sys_kevent_wait(int ctl_fd, unsigned int num, unsigned int old_uidx, 
+		struct timespec timeout, unsigned int flags)
+{
+	int err = -EINVAL, copied = 0;
+	struct file *file;
+	struct kevent_user *u;
+	struct kevent *k;
+	struct kevent_ring __user *ring;
+	long tm = MAX_SCHEDULE_TIMEOUT;
+	unsigned int i;
+
+	file = fget(ctl_fd);
+	if (!file)
+		return -EBADF;
+
+	if (file->f_op != &kevent_user_fops)
+		goto out_fput;
+	u = file->private_data;
+
+	ring = u->pring;
+	if (!ring || num > u->ring_size)
+		goto out_fput;
+#if 0
+	/*
+	 * Allow to immediately update ring index, but it is not supported,
+	 * since syscall() has limited number of arguments which is actually
+	 * a good idea - use kevent_commit() instead.
+	 */
+	if ((u->uidx != new_uidx) && (new_uidx != 0xffffffff)) {
+		mutex_lock(&u->ring_lock);
+		__kevent_user_commit(u, new_uidx, over);
+		mutex_unlock(&u->ring_lock);
+	}
+#endif
+
+	if (!(file->f_flags & O_NONBLOCK)) {
+		if (!timespec_valid(&timeout))
+			goto out_fput;
+
+		if (flags & KEVENT_FLAGS_ABSTIME) {
+			hrtimer_cancel(&u->timer);
+			hrtimer_init(&u->timer, CLOCK_REALTIME, HRTIMER_ABS);
+			u->timer.expires = ktime_set(timeout.tv_sec, timeout.tv_nsec);
+			u->timer.function = &kevent_user_wake;
+			hrtimer_start(&u->timer, u->timer.expires, HRTIMER_ABS);
+			if (unlikely(kevent_debug_abstime == 0)) {
+				printk(KERN_INFO "kevent: author was wrong, "
+						"someone uses absolute time in %s, "
+						"please report to remove this warning.\n", __func__);
+				kevent_debug_abstime = 1;
+			}
+		} else {
+			tm = timespec_to_jiffies(&timeout);
+		}
+
+		wait_event_interruptible_timeout(u->wait,
+			((u->ready_num >= 1) && kevent_ring_space(u)) || 
+				u->need_exit || old_uidx != u->uidx,
+				tm);
+	}
+	u->need_exit = 0;
+
+	for (i=0; i<num; ++i) {
+		k = kevent_dequeue_ready_ring(u);
+		if (!k)
+			break;
+		kevent_complete_ready(k);
+
+		if (k->event.ret_flags & KEVENT_RET_COPY_FAILED)
+			break;
+		kevent_stat_ring(u);
+		copied++;
+	}
+
+	fput(file);
+
+	return copied;
+out_fput:
+	fput(file);
+	return err;
+}
+
+/*
+ * This syscall is used to commit events in ring buffer, i.e. mark appropriate
+ * entries as unused by userspace so subsequent kevent_wait() could overwrite them.
+ * This fucntion returns actual number of kevents which were committed.
+ * After this function is completed userspace ring->ring_over can be updated.
+ *
+ * @ctl_fd - kevent file descriptor.
+ * @new_uidx - the last committed kevent.
+ * @over - number of overflows given queue had.
+ */
+asmlinkage long sys_kevent_commit(int ctl_fd, unsigned int new_uidx, unsigned int over)
+{
+	int err = -EINVAL, comm = 0;
+	struct file *file;
+	struct kevent_user *u;
+
+	file = fget(ctl_fd);
+	if (!file)
+		return -EBADF;
+
+	if (file->f_op != &kevent_user_fops)
+		goto out_fput;
+	u = file->private_data;
+
+	mutex_lock(&u->ring_lock);
+	err = __kevent_user_commit(u, new_uidx, over);
+	if (err < 0)
+		goto err_out_unlock;
+	comm = err;
+	mutex_unlock(&u->ring_lock);
+
+	fput(file);
+
+	return comm;
+
+err_out_unlock:
+	mutex_unlock(&u->ring_lock);
+out_fput:
+	fput(file);
+	return err;
+}
+
+/*
+ * This syscall is used to perform various control operations
+ * on given kevent queue, which is obtained through kevent file descriptor @fd.
+ * @cmd - type of operation.
+ * @num - number of kevents to be processed.
+ * @arg - pointer to array of struct ukevent.
+ */
+asmlinkage long sys_kevent_ctl(int fd, unsigned int cmd, unsigned int num, struct ukevent __user *arg)
+{
+	int err = -EINVAL;
+	struct file *file;
+
+	file = fget(fd);
+	if (!file)
+		return -EBADF;
+
+	if (file->f_op != &kevent_user_fops)
+		goto out_fput;
+
+	err = kevent_ctl_process(file, cmd, num, arg);
+
+out_fput:
+	fput(file);
+	return err;
+}
+
+/*
+ * Kevent subsystem initialization - create caches and register
+ * filesystem to get control file descriptors from.
+ */
+static int __init kevent_user_init(void)
+{
+	int err = 0;
+
+	kevent_cache = kmem_cache_create("kevent_cache",
+			sizeof(struct kevent), 0, SLAB_PANIC, NULL, NULL);
+	
+	kevent_user_cache = kmem_cache_create("kevent_user_cache",
+			sizeof(struct kevent_user), 0, SLAB_PANIC, NULL, NULL);
+	
+	err = register_filesystem(&kevent_fs_type);
+	if (err)
+		goto err_out_exit;
+	
+	kevent_mnt = kern_mount(&kevent_fs_type);
+	err = PTR_ERR(kevent_mnt);
+	if (IS_ERR(kevent_mnt))
+		goto err_out_unreg;
+
+	printk(KERN_INFO "KEVENT subsystem has been successfully registered.\n");
+
+	return 0;
+
+err_out_unreg:
+	unregister_filesystem(&kevent_fs_type);
+err_out_exit:
+	kmem_cache_destroy(kevent_cache);
+	return err;
+}
+
+module_init(kevent_user_init);
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 7a3b2e7..3b7d35f 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -122,6 +122,12 @@ cond_syscall(ppc_rtas);
 cond_syscall(sys_spu_run);
 cond_syscall(sys_spu_create);
 
+cond_syscall(sys_kevent_get_events);
+cond_syscall(sys_kevent_ctl);
+cond_syscall(sys_kevent_wait);
+cond_syscall(sys_kevent_commit);
+cond_syscall(sys_kevent_init);
+
 /* mmu depending weak syscall entries */
 cond_syscall(sys_mprotect);
 cond_syscall(sys_msync);


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [take29 1/8] kevent: Description.
  2006-12-23 16:51 ` [take29 " Evgeniy Polyakov
@ 2006-12-23 16:51   ` Evgeniy Polyakov
  2006-12-23 16:51     ` [take29 2/8] kevent: Core files Evgeniy Polyakov
  2006-12-23 17:10   ` [take29 0/8] kevent: Generic event handling mechanism Evgeniy Polyakov
                     ` (2 subsequent siblings)
  3 siblings, 1 reply; 76+ messages in thread
From: Evgeniy Polyakov @ 2006-12-23 16:51 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: David Miller, Ulrich Drepper, Andrew Morton, Evgeniy Polyakov,
	netdev, Zach Brown, Christoph Hellwig, Chase Venters,
	Johann Borck, linux-kernel, Jeff Garzik, Jamal Hadi Salim


Description.


diff --git a/Documentation/kevent.txt b/Documentation/kevent.txt
new file mode 100644
index 0000000..2e03a3f
--- /dev/null
+++ b/Documentation/kevent.txt
@@ -0,0 +1,240 @@
+Description.
+
+int kevent_init(struct kevent_ring *ring, unsigned int ring_size, 
+	unsigned int flags);
+
+num - size of the ring buffer in events 
+ring - pointer to allocated ring buffer
+flags - various flags, see KEVENT_FLAGS_* definitions.
+
+Return value: kevent control file descriptor or negative error value.
+
+ struct kevent_ring
+ {
+   unsigned int ring_kidx, ring_over;
+   struct ukevent event[0];
+ }
+
+ring_kidx - index in the ring buffer where kernel will put new events 
+		when kevent_wait() or kevent_get_events() is called 
+ring_over - number of overflows of ring_uidx happend from the start.
+	Overflow counter is used to prevent situation when two threads 
+	are going to free the same events, but one of them was scheduled 
+	away for too long, so ring indexes were wrapped, so when that 
+	thread will be awakened, it will free not those events, which 
+	it suppose to free.
+
+Example userspace code (ring_buffer.c) can be found on project's homepage.
+
+Each kevent syscall can be so called cancellation point in glibc, i.e. when 
+thread has been cancelled in kevent syscall, thread can be safely removed 
+and no events will be lost, since each syscall (kevent_wait() or 
+kevent_get_events()) will copy event into special ring buffer, accessible 
+from other threads or even processes (if shared memory is used).
+
+When kevent is removed (not dequeued when it is ready, but just removed), 
+even if it was ready, it is not copied into ring buffer, since if it is 
+removed, no one cares about it (otherwise user would wait until it becomes 
+ready and got it through usual way using kevent_get_events() or kevent_wait()) 
+and thus no need to copy it to the ring buffer.
+
+-------------------------------------------------------------------------------
+
+
+int kevent_ctl(int fd, unsigned int cmd, unsigned int num, struct ukevent *arg);
+
+fd - is the file descriptor referring to the kevent queue to manipulate. 
+It is created by opening "/dev/kevent" char device, which is created with 
+dynamic minor number and major number assigned for misc devices. 
+
+cmd - is the requested operation. It can be one of the following:
+    KEVENT_CTL_ADD - add event notification 
+    KEVENT_CTL_REMOVE - remove event notification 
+    KEVENT_CTL_MODIFY - modify existing notification 
+    KEVENT_CTL_READY - mark existing events as ready, if number of events is zero,
+    	it just wakes up parked in syscall thread
+
+num - number of struct ukevent in the array pointed to by arg 
+arg - array of struct ukevent
+
+Return value: 
+ number of events processed or negative error value.
+
+When called, kevent_ctl will carry out the operation specified in the 
+cmd parameter.
+-------------------------------------------------------------------------------
+
+ int kevent_get_events(int ctl_fd, unsigned int min_nr, unsigned int max_nr, 
+ 		struct timespec timeout, struct ukevent *buf, unsigned flags);
+
+ctl_fd - file descriptor referring to the kevent queue 
+min_nr - minimum number of completed events that kevent_get_events will block 
+	 waiting for 
+max_nr - number of struct ukevent in buf 
+timeout - time to wait before returning less than min_nr 
+	  events. If this is -1, then wait forever. 
+buf - pointer to an array of struct ukevent. 
+flags - various flags, see KEVENT_FLAGS_* definitions.
+
+Return value:
+ number of events copied or negative error value.
+
+kevent_get_events will wait timeout milliseconds for at least min_nr completed 
+events, copying completed struct ukevents to buf and deleting any 
+KEVENT_REQ_ONESHOT event requests. In nonblocking mode it returns as many 
+events as possible, but not more than max_nr. In blocking mode it waits until 
+timeout or if at least min_nr events are ready.
+
+This function copies event into ring buffer if it was initialized, if ring buffer
+is full, KEVENT_RET_COPY_FAILED flag is set in ret_flags field.
+-------------------------------------------------------------------------------
+
+ int kevent_wait(int ctl_fd, unsigned int num, unsigned int old_uidx, 
+ 	struct timespec timeout, unsigned int flags);
+
+ctl_fd - file descriptor referring to the kevent queue 
+num - number of processed kevents 
+old_uidx - the last index user is aware of
+timeout - time to wait until there is free space in kevent queue
+flags - various flags, see KEVENT_FLAGS_* definitions.
+
+Return value:
+ number of events copied into ring buffer or negative error value.
+
+This syscall waits until either timeout expires or at least one event becomes 
+ready. It also copies events into special ring buffer. If ring buffer is full,
+it waits until there are ready events and then return.
+If kevent is one-shot kevent it is removed in this syscall.
+If kevent is edge-triggered (KEVENT_REQ_ET flag is set in 'req_flags') it is 
+requeued in this syscall for performance reasons.
+-------------------------------------------------------------------------------
+
+ int kevent_commit(int ctl_fd, unsigned int new_idx, unsigned int over);
+
+ctl_fd - file descriptor referring to the kevent queue 
+new_uidx - the last committed kevent
+over - overflow count for given $new_idx value
+
+Return value:
+ number of committed kevents or negative error value.
+
+This function commits, i.e. marks as empty, slots in the ring buffer, so
+they can be reused when userspace completes that entries processing.
+
+Overflow counter is used to prevent situation when two threads are going 
+to free the same events, but one of them was scheduled away for too long, 
+so ring indexes were wrapped, so when that thread will be awakened, it 
+will free not those events, which it suppose to free.
+
+It is possible that returned number of committed events will be smaller than
+requested number - it is possible when several threads try to commit the
+same events.
+-------------------------------------------------------------------------------
+
+The bulk of the interface is entirely done through the ukevent struct. 
+It is used to add event requests, modify existing event requests, 
+specify which event requests to remove, and return completed events.
+
+struct ukevent contains the following members:
+
+struct kevent_id id
+    Id of this request, e.g. socket number, file descriptor and so on 
+__u32 type
+    Event type, e.g. KEVENT_SOCK, KEVENT_INODE, KEVENT_TIMER and so on 
+__u32 event
+    Event itself, e.g. SOCK_ACCEPT, INODE_CREATED, TIMER_FIRED 
+__u32 req_flags
+    Per-event request flags,
+
+    KEVENT_REQ_ONESHOT
+        event will be removed when it is ready 
+
+    KEVENT_REQ_WAKEUP_ALL
+        Kevent wakes up only first thread interested in given event, 
+	or all threads if this flag is set.
+
+    KEVENT_REQ_ET
+        Edge Triggered behaviour. It is an optimisation which allows to move 
+	ready and dequeued (i.e. copied to userspace) event to move into set 
+	of interest for given storage (socket, inode and so on) again. It is 
+	very usefull for cases when the same event should be used many times 
+	(like reading from pipe). It is similar to epoll()'s EPOLLET flag. 
+
+    KEVENT_REQ_LAST_CHECK
+        if set allows to perform the last check on kevent (call appropriate 
+	callback) when kevent is marked as ready and has been removed from 
+	ready queue. If it will be confirmed that kevent is ready 
+	(k->callbacks.callback(k) returns true) then kevent will be copied 
+	to userspace, otherwise it will be requeued back to storage. 
+	Second (checking) call is performed with this bit cleared, so callback 
+	can detect when it was called from kevent_storage_ready() - bit is set, 
+	or kevent_dequeue_ready() - bit is cleared. If kevent will be requeued, 
+	bit will be set again.
+
+   KEVENT_REQ_ALWAYS_QUEUE
+        If this flag is set kevent will be queued into ready queue if it is 
+	ready at enqueue time, otherwise it will be copied back to userspace
+	and will not be queued into the storage.
+
+__u32 ret_flags
+    Per-event return flags
+
+    KEVENT_RET_BROKEN
+        Kevent is broken 
+
+    KEVENT_RET_DONE
+        Kevent processing was finished successfully 
+
+    KEVENT_RET_COPY_FAILED
+        Kevent was not copied into ring buffer due to some error conditions. 
+
+__u32 ret_data
+    Event return data. Event originator fills it with anything it likes 
+    (for example timer notifications put number of milliseconds when timer 
+    has fired 
+union { __u32 user[2]; void *ptr; }
+    User's data. It is not used, just copied to/from user. The whole structure 
+    is aligned to 8 bytes already, so the last union is aligned properly. 
+
+-------------------------------------------------------------------------------
+
+Kevent waiting syscall flags.
+
+KEVENT_FLAGS_ABSTIME - provided timespec parameter contains absolute time, 
+	for example Aug 27, 2194, or time(NULL) + 10.
+
+-------------------------------------------------------------------------------
+
+Usage
+
+For KEVENT_CTL_ADD, all fields relevant to the event type must be filled 
+(id, type, event, req_flags). 
+After kevent_ctl(..., KEVENT_CTL_ADD, ...) returns each struct's ret_flags 
+should be checked to see if the event is already broken or done.
+
+For KEVENT_CTL_MODIFY, the id, req_flags, and user and event fields must be 
+set and an existing kevent request must have matching id and user fields. If 
+match is found, req_flags and event are replaced with the newly supplied 
+values and requeueing is started, so modified kevent can be checked and 
+probably marked as ready immediately. If a match can't be found, the 
+passed in ukevent's ret_flags has KEVENT_RET_BROKEN set. KEVENT_RET_DONE is 
+always set.
+
+For KEVENT_CTL_REMOVE, the id and user fields must be set and an existing 
+kevent request must have matching id and user fields. If a match is found, 
+the kevent request is removed. If a match can't be found, the passed in 
+ukevent's ret_flags has KEVENT_RET_BROKEN set. KEVENT_RET_DONE is always set.
+
+For kevent_get_events, the entire structure is returned.
+
+-------------------------------------------------------------------------------
+
+Usage cases
+
+kevent_timer
+struct ukevent should contain following fields:
+    type - KEVENT_TIMER 
+    event - KEVENT_TIMER_FIRED 
+    req_flags - KEVENT_REQ_ONESHOT if you want to fire that timer only once 
+    id.raw[0] - number of seconds after commit when this timer shout expire 
+    id.raw[0] - additional to number of seconds number of nanoseconds 


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [take29 3/8] kevent: poll/select() notifications.
  2006-12-23 16:51     ` [take29 2/8] kevent: Core files Evgeniy Polyakov
@ 2006-12-23 16:51       ` Evgeniy Polyakov
  2006-12-23 16:51         ` [take29 4/8] kevent: Socket notifications Evgeniy Polyakov
  0 siblings, 1 reply; 76+ messages in thread
From: Evgeniy Polyakov @ 2006-12-23 16:51 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: David Miller, Ulrich Drepper, Andrew Morton, Evgeniy Polyakov,
	netdev, Zach Brown, Christoph Hellwig, Chase Venters,
	Johann Borck, linux-kernel, Jeff Garzik, Jamal Hadi Salim


poll/select() notifications.

This patch includes generic poll/select notifications.
kevent_poll works simialr to epoll and has the same issues (callback
is invoked not from internal state machine of the caller, but through
process awake, a lot of allocations and so on).

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mitp.ru>

diff --git a/fs/file_table.c b/fs/file_table.c
index bc35a40..0805547 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -20,6 +20,7 @@
 #include <linux/cdev.h>
 #include <linux/fsnotify.h>
 #include <linux/sysctl.h>
+#include <linux/kevent.h>
 #include <linux/percpu_counter.h>
 
 #include <asm/atomic.h>
@@ -119,6 +120,7 @@ struct file *get_empty_filp(void)
 	f->f_uid = tsk->fsuid;
 	f->f_gid = tsk->fsgid;
 	eventpoll_init_file(f);
+	kevent_init_file(f);
 	/* f->f_version: 0 */
 	return f;
 
@@ -164,6 +166,7 @@ void fastcall __fput(struct file *file)
 	 * in the file cleanup chain.
 	 */
 	eventpoll_release(file);
+	kevent_cleanup_file(file);
 	locks_remove_flock(file);
 
 	if (file->f_op && file->f_op->release)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 5baf3a1..8bbf3a5 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -276,6 +276,7 @@ extern int dir_notify_enable;
 #include <linux/init.h>
 #include <linux/sched.h>
 #include <linux/mutex.h>
+#include <linux/kevent_storage.h>
 
 #include <asm/atomic.h>
 #include <asm/semaphore.h>
@@ -586,6 +587,10 @@ struct inode {
 	struct mutex		inotify_mutex;	/* protects the watches list */
 #endif
 
+#if defined CONFIG_KEVENT_SOCKET || defined CONFIG_KEVENT_PIPE
+	struct kevent_storage	st;
+#endif
+
 	unsigned long		i_state;
 	unsigned long		dirtied_when;	/* jiffies of first dirtying */
 
@@ -739,6 +744,9 @@ struct file {
 	struct list_head	f_ep_links;
 	spinlock_t		f_ep_lock;
 #endif /* #ifdef CONFIG_EPOLL */
+#ifdef CONFIG_KEVENT_POLL
+	struct kevent_storage	st;
+#endif
 	struct address_space	*f_mapping;
 };
 extern spinlock_t files_lock;
diff --git a/kernel/kevent/kevent_poll.c b/kernel/kevent/kevent_poll.c
new file mode 100644
index 0000000..14094d5
--- /dev/null
+++ b/kernel/kevent/kevent_poll.c
@@ -0,0 +1,234 @@
+/*
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/timer.h>
+#include <linux/file.h>
+#include <linux/kevent.h>
+#include <linux/poll.h>
+#include <linux/fs.h>
+
+static kmem_cache_t *kevent_poll_container_cache;
+static kmem_cache_t *kevent_poll_priv_cache;
+
+struct kevent_poll_ctl
+{
+	struct poll_table_struct 	pt;
+	struct kevent			*k;
+};
+
+struct kevent_poll_wait_container
+{
+	struct list_head		container_entry;
+	wait_queue_head_t		*whead;
+	wait_queue_t			wait;
+	struct kevent			*k;
+};
+
+struct kevent_poll_private
+{
+	struct list_head		container_list;
+	spinlock_t			container_lock;
+};
+
+static int kevent_poll_enqueue(struct kevent *k);
+static int kevent_poll_dequeue(struct kevent *k);
+static int kevent_poll_callback(struct kevent *k);
+
+static int kevent_poll_wait_callback(wait_queue_t *wait,
+		unsigned mode, int sync, void *key)
+{
+	struct kevent_poll_wait_container *cont =
+		container_of(wait, struct kevent_poll_wait_container, wait);
+	struct kevent *k = cont->k;
+
+	kevent_storage_ready(k->st, NULL, KEVENT_MASK_ALL);
+	return 0;
+}
+
+static void kevent_poll_qproc(struct file *file, wait_queue_head_t *whead,
+		struct poll_table_struct *poll_table)
+{
+	struct kevent *k =
+		container_of(poll_table, struct kevent_poll_ctl, pt)->k;
+	struct kevent_poll_private *priv = k->priv;
+	struct kevent_poll_wait_container *cont;
+	unsigned long flags;
+
+	cont = kmem_cache_alloc(kevent_poll_container_cache, GFP_KERNEL);
+	if (!cont) {
+		kevent_break(k);
+		return;
+	}
+
+	cont->k = k;
+	init_waitqueue_func_entry(&cont->wait, kevent_poll_wait_callback);
+	cont->whead = whead;
+
+	spin_lock_irqsave(&priv->container_lock, flags);
+	list_add_tail(&cont->container_entry, &priv->container_list);
+	spin_unlock_irqrestore(&priv->container_lock, flags);
+
+	add_wait_queue(whead, &cont->wait);
+}
+
+static int kevent_poll_enqueue(struct kevent *k)
+{
+	struct file *file;
+	int err;
+	unsigned int revents;
+	unsigned long flags;
+	struct kevent_poll_ctl ctl;
+	struct kevent_poll_private *priv;
+
+	file = fget(k->event.id.raw[0]);
+	if (!file)
+		return -EBADF;
+	
+	err = -EINVAL;
+	if (!file->f_op || !file->f_op->poll)
+		goto err_out_fput;
+	
+	err = -ENOMEM;
+	priv = kmem_cache_alloc(kevent_poll_priv_cache, GFP_KERNEL);
+	if (!priv)
+		goto err_out_fput;
+
+	spin_lock_init(&priv->container_lock);
+	INIT_LIST_HEAD(&priv->container_list);
+
+	k->priv = priv;
+
+	ctl.k = k;
+	init_poll_funcptr(&ctl.pt, &kevent_poll_qproc);
+	
+	err = kevent_storage_enqueue(&file->st, k);
+	if (err)
+		goto err_out_free;
+
+	revents = file->f_op->poll(file, &ctl.pt);
+	if (k->event.req_flags & KEVENT_REQ_ALWAYS_QUEUE) {
+		kevent_requeue(k);
+	} else {
+		if (revents & k->event.event) {
+			err = 1;
+			goto out_dequeue;
+		}
+	}
+
+	spin_lock_irqsave(&k->ulock, flags);
+	k->event.req_flags |= KEVENT_REQ_LAST_CHECK;
+	spin_unlock_irqrestore(&k->ulock, flags);
+
+	return 0;
+
+out_dequeue:
+	kevent_storage_dequeue(k->st, k);
+err_out_free:
+	kmem_cache_free(kevent_poll_priv_cache, priv);
+err_out_fput:
+	fput(file);
+	return err;
+}
+
+static int kevent_poll_dequeue(struct kevent *k)
+{
+	struct file *file = k->st->origin;
+	struct kevent_poll_private *priv = k->priv;
+	struct kevent_poll_wait_container *w, *n;
+	unsigned long flags;
+
+	kevent_storage_dequeue(k->st, k);
+
+	spin_lock_irqsave(&priv->container_lock, flags);
+	list_for_each_entry_safe(w, n, &priv->container_list, container_entry) {
+		list_del(&w->container_entry);
+		remove_wait_queue(w->whead, &w->wait);
+		kmem_cache_free(kevent_poll_container_cache, w);
+	}
+	spin_unlock_irqrestore(&priv->container_lock, flags);
+
+	kmem_cache_free(kevent_poll_priv_cache, priv);
+	k->priv = NULL;
+
+	fput(file);
+
+	return 0;
+}
+
+static int kevent_poll_callback(struct kevent *k)
+{
+	if (k->event.req_flags & KEVENT_REQ_LAST_CHECK) {
+		return 1;
+	} else {
+		struct file *file = k->st->origin;
+		unsigned int revents = file->f_op->poll(file, NULL);
+
+		k->event.ret_data[0] = revents & k->event.event;
+
+		return (revents & k->event.event);
+	}
+}
+
+static int __init kevent_poll_sys_init(void)
+{
+	struct kevent_callbacks pc = {
+		.callback = &kevent_poll_callback,
+		.enqueue = &kevent_poll_enqueue,
+		.dequeue = &kevent_poll_dequeue,
+		.flags = 0,
+	};
+
+	kevent_poll_container_cache = kmem_cache_create("kevent_poll_container_cache",
+			sizeof(struct kevent_poll_wait_container), 0, 0, NULL, NULL);
+	if (!kevent_poll_container_cache) {
+		printk(KERN_ERR "Failed to create kevent poll container cache.\n");
+		return -ENOMEM;
+	}
+
+	kevent_poll_priv_cache = kmem_cache_create("kevent_poll_priv_cache",
+			sizeof(struct kevent_poll_private), 0, 0, NULL, NULL);
+	if (!kevent_poll_priv_cache) {
+		printk(KERN_ERR "Failed to create kevent poll private data cache.\n");
+		kmem_cache_destroy(kevent_poll_container_cache);
+		kevent_poll_container_cache = NULL;
+		return -ENOMEM;
+	}
+
+	kevent_add_callbacks(&pc, KEVENT_POLL);
+
+	printk(KERN_INFO "Kevent poll()/select() subsystem has been initialized.\n");
+	return 0;
+}
+
+static struct lock_class_key kevent_poll_key;
+
+void kevent_poll_reinit(struct file *file)
+{
+	lockdep_set_class(&file->st.lock, &kevent_poll_key);
+}
+
+static void __exit kevent_poll_sys_fini(void)
+{
+	kmem_cache_destroy(kevent_poll_priv_cache);
+	kmem_cache_destroy(kevent_poll_container_cache);
+}
+
+module_init(kevent_poll_sys_init);
+module_exit(kevent_poll_sys_fini);


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [take29 4/8] kevent: Socket notifications.
  2006-12-23 16:51       ` [take29 3/8] kevent: poll/select() notifications Evgeniy Polyakov
@ 2006-12-23 16:51         ` Evgeniy Polyakov
  2006-12-23 16:51           ` [take29 5/8] kevent: Timer notifications Evgeniy Polyakov
  0 siblings, 1 reply; 76+ messages in thread
From: Evgeniy Polyakov @ 2006-12-23 16:51 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: David Miller, Ulrich Drepper, Andrew Morton, Evgeniy Polyakov,
	netdev, Zach Brown, Christoph Hellwig, Chase Venters,
	Johann Borck, linux-kernel, Jeff Garzik, Jamal Hadi Salim


Socket notifications.

This patch includes socket send/recv/accept notifications.
Using trivial web server based on kevent and this features
instead of epoll it's performance increased more than noticebly.
More details about various benchmarks and server itself 
(evserver_kevent.c) can be found on project's homepage.

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mitp.ru>

diff --git a/fs/inode.c b/fs/inode.c
index ada7643..2740617 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -21,6 +21,7 @@
 #include <linux/cdev.h>
 #include <linux/bootmem.h>
 #include <linux/inotify.h>
+#include <linux/kevent.h>
 #include <linux/mount.h>
 
 /*
@@ -164,12 +165,18 @@ static struct inode *alloc_inode(struct super_block *sb)
 		}
 		inode->i_private = 0;
 		inode->i_mapping = mapping;
+#if defined CONFIG_KEVENT_SOCKET || defined CONFIG_KEVENT_PIPE
+		kevent_storage_init(inode, &inode->st);
+#endif
 	}
 	return inode;
 }
 
 void destroy_inode(struct inode *inode) 
 {
+#if defined CONFIG_KEVENT_SOCKET || defined CONFIG_KEVENT_PIPE
+	kevent_storage_fini(&inode->st);
+#endif
 	BUG_ON(inode_has_buffers(inode));
 	security_inode_free(inode);
 	if (inode->i_sb->s_op->destroy_inode)
diff --git a/include/net/sock.h b/include/net/sock.h
index edd4d73..d48ded8 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -48,6 +48,7 @@
 #include <linux/netdevice.h>
 #include <linux/skbuff.h>	/* struct sk_buff */
 #include <linux/security.h>
+#include <linux/kevent.h>
 
 #include <linux/filter.h>
 
@@ -450,6 +451,21 @@ static inline int sk_stream_memory_free(struct sock *sk)
 
 extern void sk_stream_rfree(struct sk_buff *skb);
 
+struct socket_alloc {
+	struct socket socket;
+	struct inode vfs_inode;
+};
+
+static inline struct socket *SOCKET_I(struct inode *inode)
+{
+	return &container_of(inode, struct socket_alloc, vfs_inode)->socket;
+}
+
+static inline struct inode *SOCK_INODE(struct socket *socket)
+{
+	return &container_of(socket, struct socket_alloc, socket)->vfs_inode;
+}
+
 static inline void sk_stream_set_owner_r(struct sk_buff *skb, struct sock *sk)
 {
 	skb->sk = sk;
@@ -477,6 +493,7 @@ static inline void sk_add_backlog(struct sock *sk, struct sk_buff *skb)
 		sk->sk_backlog.tail = skb;
 	}
 	skb->next = NULL;
+	kevent_socket_notify(sk, KEVENT_SOCKET_RECV);
 }
 
 #define sk_wait_event(__sk, __timeo, __condition)		\
@@ -679,21 +696,6 @@ static inline struct kiocb *siocb_to_kiocb(struct sock_iocb *si)
 	return si->kiocb;
 }
 
-struct socket_alloc {
-	struct socket socket;
-	struct inode vfs_inode;
-};
-
-static inline struct socket *SOCKET_I(struct inode *inode)
-{
-	return &container_of(inode, struct socket_alloc, vfs_inode)->socket;
-}
-
-static inline struct inode *SOCK_INODE(struct socket *socket)
-{
-	return &container_of(socket, struct socket_alloc, socket)->vfs_inode;
-}
-
 extern void __sk_stream_mem_reclaim(struct sock *sk);
 extern int sk_stream_mem_schedule(struct sock *sk, int size, int kind);
 
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 7a093d0..69f4ad2 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -857,6 +857,7 @@ static inline int tcp_prequeue(struct sock *sk, struct sk_buff *skb)
 			tp->ucopy.memory = 0;
 		} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
 			wake_up_interruptible(sk->sk_sleep);
+			kevent_socket_notify(sk, KEVENT_SOCKET_RECV|KEVENT_SOCKET_SEND);
 			if (!inet_csk_ack_scheduled(sk))
 				inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
 						          (3 * TCP_RTO_MIN) / 4,
diff --git a/kernel/kevent/kevent_socket.c b/kernel/kevent/kevent_socket.c
new file mode 100644
index 0000000..d1a2701
--- /dev/null
+++ b/kernel/kevent/kevent_socket.c
@@ -0,0 +1,144 @@
+/*
+ * 	kevent_socket.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/timer.h>
+#include <linux/file.h>
+#include <linux/tcp.h>
+#include <linux/kevent.h>
+
+#include <net/sock.h>
+#include <net/request_sock.h>
+#include <net/inet_connection_sock.h>
+
+static int kevent_socket_callback(struct kevent *k)
+{
+	struct inode *inode = k->st->origin;
+	unsigned int events = SOCKET_I(inode)->ops->poll(SOCKET_I(inode)->file, SOCKET_I(inode), NULL);
+
+	if ((events & (POLLIN | POLLRDNORM)) && (k->event.event & (KEVENT_SOCKET_RECV | KEVENT_SOCKET_ACCEPT)))
+		return 1;
+	if ((events & (POLLOUT | POLLWRNORM)) && (k->event.event & KEVENT_SOCKET_SEND))
+		return 1;
+	if (events & (POLLERR | POLLHUP | POLLRDHUP | POLLREMOVE))
+		return -1;
+	return 0;
+}
+
+int kevent_socket_enqueue(struct kevent *k)
+{
+	struct inode *inode;
+	struct socket *sock;
+	int err = -EBADF;
+
+	sock = sockfd_lookup(k->event.id.raw[0], &err);
+	if (!sock)
+		goto err_out_exit;
+
+	inode = igrab(SOCK_INODE(sock));
+	if (!inode)
+		goto err_out_fput;
+
+	err = kevent_storage_enqueue(&inode->st, k);
+	if (err)
+		goto err_out_iput;
+
+	if (k->event.req_flags & KEVENT_REQ_ALWAYS_QUEUE) {
+		kevent_requeue(k);
+		err = 0;
+	} else {
+		err = k->callbacks.callback(k);
+		if (err)
+			goto err_out_dequeue;
+	}
+
+	return err;
+
+err_out_dequeue:
+	kevent_storage_dequeue(k->st, k);
+err_out_iput:
+	iput(inode);
+err_out_fput:
+	sockfd_put(sock);
+err_out_exit:
+	return err;
+}
+
+int kevent_socket_dequeue(struct kevent *k)
+{
+	struct inode *inode = k->st->origin;
+	struct socket *sock;
+
+	kevent_storage_dequeue(k->st, k);
+
+	sock = SOCKET_I(inode);
+	iput(inode);
+	sockfd_put(sock);
+
+	return 0;
+}
+
+void kevent_socket_notify(struct sock *sk, u32 event)
+{
+	if (sk->sk_socket)
+		kevent_storage_ready(&SOCK_INODE(sk->sk_socket)->st, NULL, event);
+}
+
+/*
+ * It is required for network protocols compiled as modules, like IPv6.
+ */
+EXPORT_SYMBOL_GPL(kevent_socket_notify);
+
+#ifdef CONFIG_LOCKDEP
+static struct lock_class_key kevent_sock_key;
+
+void kevent_socket_reinit(struct socket *sock)
+{
+	struct inode *inode = SOCK_INODE(sock);
+
+	lockdep_set_class(&inode->st.lock, &kevent_sock_key);
+}
+
+void kevent_sk_reinit(struct sock *sk)
+{
+	if (sk->sk_socket) {
+		struct inode *inode = SOCK_INODE(sk->sk_socket);
+
+		lockdep_set_class(&inode->st.lock, &kevent_sock_key);
+	}
+}
+#endif
+static int __init kevent_init_socket(void)
+{
+	struct kevent_callbacks sc = {
+		.callback = &kevent_socket_callback,
+		.enqueue = &kevent_socket_enqueue,
+		.dequeue = &kevent_socket_dequeue,
+		.flags = 0,
+	};
+
+	return kevent_add_callbacks(&sc, KEVENT_SOCKET);
+}
+module_init(kevent_init_socket);
diff --git a/net/core/sock.c b/net/core/sock.c
index b77e155..7d5fa3e 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1402,6 +1402,7 @@ static void sock_def_wakeup(struct sock *sk)
 	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
 		wake_up_interruptible_all(sk->sk_sleep);
 	read_unlock(&sk->sk_callback_lock);
+	kevent_socket_notify(sk, KEVENT_SOCKET_RECV|KEVENT_SOCKET_SEND);
 }
 
 static void sock_def_error_report(struct sock *sk)
@@ -1411,6 +1412,7 @@ static void sock_def_error_report(struct sock *sk)
 		wake_up_interruptible(sk->sk_sleep);
 	sk_wake_async(sk,0,POLL_ERR); 
 	read_unlock(&sk->sk_callback_lock);
+	kevent_socket_notify(sk, KEVENT_SOCKET_RECV|KEVENT_SOCKET_SEND);
 }
 
 static void sock_def_readable(struct sock *sk, int len)
@@ -1420,6 +1422,7 @@ static void sock_def_readable(struct sock *sk, int len)
 		wake_up_interruptible(sk->sk_sleep);
 	sk_wake_async(sk,1,POLL_IN);
 	read_unlock(&sk->sk_callback_lock);
+	kevent_socket_notify(sk, KEVENT_SOCKET_RECV|KEVENT_SOCKET_SEND);
 }
 
 static void sock_def_write_space(struct sock *sk)
@@ -1439,6 +1442,7 @@ static void sock_def_write_space(struct sock *sk)
 	}
 
 	read_unlock(&sk->sk_callback_lock);
+	kevent_socket_notify(sk, KEVENT_SOCKET_SEND|KEVENT_SOCKET_RECV);
 }
 
 static void sock_def_destruct(struct sock *sk)
@@ -1489,6 +1493,8 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 	sk->sk_state		=	TCP_CLOSE;
 	sk->sk_socket		=	sock;
 
+	kevent_sk_reinit(sk);
+
 	sock_set_flag(sk, SOCK_ZAPPED);
 
 	if(sock)
@@ -1555,8 +1561,10 @@ void fastcall release_sock(struct sock *sk)
 	if (sk->sk_backlog.tail)
 		__release_sock(sk);
 	sk->sk_lock.owner = NULL;
-	if (waitqueue_active(&sk->sk_lock.wq))
+	if (waitqueue_active(&sk->sk_lock.wq)) {
 		wake_up(&sk->sk_lock.wq);
+		kevent_socket_notify(sk, KEVENT_SOCKET_RECV|KEVENT_SOCKET_SEND);
+	}
 	spin_unlock_bh(&sk->sk_lock.slock);
 }
 EXPORT_SYMBOL(release_sock);
diff --git a/net/core/stream.c b/net/core/stream.c
index d1d7dec..2878c2a 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -36,6 +36,7 @@ void sk_stream_write_space(struct sock *sk)
 			wake_up_interruptible(sk->sk_sleep);
 		if (sock->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN))
 			sock_wake_async(sock, 2, POLL_OUT);
+		kevent_socket_notify(sk, KEVENT_SOCKET_SEND|KEVENT_SOCKET_RECV);
 	}
 }
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 3f884ce..e7dd989 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3119,6 +3119,7 @@ static void tcp_ofo_queue(struct sock *sk)
 
 		__skb_unlink(skb, &tp->out_of_order_queue);
 		__skb_queue_tail(&sk->sk_receive_queue, skb);
+		kevent_socket_notify(sk, KEVENT_SOCKET_RECV);
 		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
 		if(skb->h.th->fin)
 			tcp_fin(skb, sk, skb->h.th);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index c83938b..b0dd70d 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -61,6 +61,7 @@
 #include <linux/jhash.h>
 #include <linux/init.h>
 #include <linux/times.h>
+#include <linux/kevent.h>
 
 #include <net/icmp.h>
 #include <net/inet_hashtables.h>
@@ -870,6 +871,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	   	reqsk_free(req);
 	} else {
 		inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
+		kevent_socket_notify(sk, KEVENT_SOCKET_ACCEPT);
 	}
 	return 0;
 
diff --git a/net/socket.c b/net/socket.c
index 1bc4167..5582b4a 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -85,6 +85,7 @@
 #include <linux/kmod.h>
 #include <linux/audit.h>
 #include <linux/wireless.h>
+#include <linux/kevent.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -490,6 +491,8 @@ static struct socket *sock_alloc(void)
 	inode->i_uid = current->fsuid;
 	inode->i_gid = current->fsgid;
 
+	kevent_socket_reinit(sock);
+
 	get_cpu_var(sockets_in_use)++;
 	put_cpu_var(sockets_in_use);
 	return sock;
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index b43a278..9e64fb0 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1562,8 +1562,10 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
 	struct scm_cookie tmp_scm;
 	struct sock *sk = sock->sk;
 	struct unix_sock *u = unix_sk(sk);
+	struct sock *other;
 	int noblock = flags & MSG_DONTWAIT;
 	struct sk_buff *skb;
+
 	int err;
 
 	err = -EOPNOTSUPP;
@@ -1579,6 +1581,12 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
 		goto out_unlock;
 
 	wake_up_interruptible(&u->peer_wait);
+	other =unix_peer_get(sk);
+	if (other) {
+		kevent_socket_notify(other, KEVENT_SOCKET_SEND);
+		sock_put(other);
+	} else
+		kevent_socket_notify(sk, KEVENT_SOCKET_RECV);
 
 	if (msg->msg_name)
 		unix_copy_addr(msg, skb->sk);
@@ -1673,7 +1681,7 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
 {
 	struct sock_iocb *siocb = kiocb_to_siocb(iocb);
 	struct scm_cookie tmp_scm;
-	struct sock *sk = sock->sk;
+	struct sock *sk = sock->sk, *other;
 	struct unix_sock *u = unix_sk(sk);
 	struct sockaddr_un *sunaddr=msg->msg_name;
 	int copied = 0;
@@ -1802,6 +1810,14 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
 		}
 	} while (size);
 
+	other =unix_peer_get(sk);
+	if (other) {
+		kevent_socket_notify(other, KEVENT_SOCKET_SEND);
+		sock_put(other);
+	}
+	if (sk->sk_shutdown & RCV_SHUTDOWN || !other)
+		kevent_socket_notify(sk, KEVENT_SOCKET_RECV);
+
 	mutex_unlock(&u->readlock);
 	scm_recv(sock, msg, siocb->scm, flags);
 out:
@@ -1823,6 +1839,10 @@ static int unix_shutdown(struct socket *sock, int mode)
 			sock_hold(other);
 		unix_state_wunlock(sk);
 		sk->sk_state_change(sk);
+		kevent_socket_notify(sk, KEVENT_SOCKET_SEND|KEVENT_SOCKET_RECV);
+		if (other)
+			kevent_socket_notify(other, KEVENT_SOCKET_SEND|KEVENT_SOCKET_RECV);
+		kevent_socket_notify(sk, KEVENT_SOCKET_SEND|KEVENT_SOCKET_RECV);
 
 		if (other &&
 			(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [take29 8/8] kevent: Kevent posix timer notifications.
  2006-12-23 16:51               ` [take29 7/8] kevent: Signal notifications Evgeniy Polyakov
@ 2006-12-23 16:51                 ` Evgeniy Polyakov
  0 siblings, 0 replies; 76+ messages in thread
From: Evgeniy Polyakov @ 2006-12-23 16:51 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: David Miller, Ulrich Drepper, Andrew Morton, Evgeniy Polyakov,
	netdev, Zach Brown, Christoph Hellwig, Chase Venters,
	Johann Borck, linux-kernel, Jeff Garzik, Jamal Hadi Salim


Kevent posix timer notifications.

Simple extensions to POSIX timers which allows
to deliver notification of the timer expiration
through kevent queue.

Example application posix_timer.c can be found
in archive on project homepage.

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>


diff --git a/include/asm-generic/siginfo.h b/include/asm-generic/siginfo.h
index 8786e01..3768746 100644
--- a/include/asm-generic/siginfo.h
+++ b/include/asm-generic/siginfo.h
@@ -235,6 +235,7 @@ typedef struct siginfo {
 #define SIGEV_NONE	1	/* other notification: meaningless */
 #define SIGEV_THREAD	2	/* deliver via thread creation */
 #define SIGEV_THREAD_ID 4	/* deliver to thread */
+#define SIGEV_KEVENT	8	/* deliver through kevent queue */
 
 /*
  * This works because the alignment is ok on all current architectures
@@ -260,6 +261,8 @@ typedef struct sigevent {
 			void (*_function)(sigval_t);
 			void *_attribute;	/* really pthread_attr_t */
 		} _sigev_thread;
+
+		int kevent_fd;
 	} _sigev_un;
 } sigevent_t;
 
diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index a7dd38f..4b9deb4 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -4,6 +4,7 @@
 #include <linux/spinlock.h>
 #include <linux/list.h>
 #include <linux/sched.h>
+#include <linux/kevent_storage.h>
 
 union cpu_time_count {
 	cputime_t cpu;
@@ -49,6 +50,9 @@ struct k_itimer {
 	sigval_t it_sigev_value;	/* value word of sigevent struct */
 	struct task_struct *it_process;	/* process to send signal to */
 	struct sigqueue *sigq;		/* signal queue entry. */
+#ifdef CONFIG_KEVENT_TIMER
+	struct kevent_storage st;
+#endif
 	union {
 		struct {
 			struct hrtimer timer;
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index e5ebcc1..74270f8 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -48,6 +48,8 @@
 #include <linux/wait.h>
 #include <linux/workqueue.h>
 #include <linux/module.h>
+#include <linux/kevent.h>
+#include <linux/file.h>
 
 /*
  * Management arrays for POSIX timers.	 Timers are kept in slab memory
@@ -224,6 +226,100 @@ static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp)
 	return 0;
 }
 
+#ifdef CONFIG_KEVENT_TIMER
+static int posix_kevent_enqueue(struct kevent *k)
+{
+	/*
+	 * It is not ugly - there is no pointer in the id field union, 
+	 * but its size is 64bits, which is ok for any known pointer size.
+	 */
+	struct k_itimer *tmr = (struct k_itimer *)(unsigned long)k->event.id.raw_u64;
+	return kevent_storage_enqueue(&tmr->st, k);
+}
+static int posix_kevent_dequeue(struct kevent *k)
+{
+	struct k_itimer *tmr = (struct k_itimer *)(unsigned long)k->event.id.raw_u64;
+	kevent_storage_dequeue(&tmr->st, k);
+	return 0;
+}
+static int posix_kevent_callback(struct kevent *k)
+{
+	return 1;
+}
+static int posix_kevent_init(void)
+{
+	struct kevent_callbacks tc = {
+		.callback = &posix_kevent_callback,
+		.enqueue = &posix_kevent_enqueue,
+		.dequeue = &posix_kevent_dequeue,
+		.flags = KEVENT_CALLBACKS_KERNELONLY};
+
+	return kevent_add_callbacks(&tc, KEVENT_POSIX_TIMER);
+}
+
+extern struct file_operations kevent_user_fops;
+
+static int posix_kevent_init_timer(struct k_itimer *tmr, int fd)
+{
+	struct ukevent uk;
+	struct file *file;
+	struct kevent_user *u;
+	int err;
+
+	file = fget(fd);
+	if (!file) {
+		err = -EBADF;
+		goto err_out;
+	}
+
+	if (file->f_op != &kevent_user_fops) {
+		err = -EINVAL;
+		goto err_out_fput;
+	}
+
+	u = file->private_data;
+
+	memset(&uk, 0, sizeof(struct ukevent));
+
+	uk.event = KEVENT_MASK_ALL;
+	uk.type = KEVENT_POSIX_TIMER;
+	uk.id.raw_u64 = (unsigned long)(tmr); /* Just cast to something unique */
+	uk.req_flags = KEVENT_REQ_ONESHOT | KEVENT_REQ_ALWAYS_QUEUE;
+	uk.ptr = tmr->it_sigev_value.sival_ptr;
+
+	err = kevent_user_add_ukevent(&uk, u);
+	if (err)
+		goto err_out_fput;
+
+	fput(file);
+
+	return 0;
+
+err_out_fput:
+	fput(file);
+err_out:
+	return err;
+}
+
+static void posix_kevent_fini_timer(struct k_itimer *tmr)
+{
+	kevent_storage_fini(&tmr->st);
+}
+#else
+static int posix_kevent_init_timer(struct k_itimer *tmr, int fd)
+{
+	return -ENOSYS;
+}
+static int posix_kevent_init(void)
+{
+	return 0;
+}
+static void posix_kevent_fini_timer(struct k_itimer *tmr)
+{
+}
+#endif
+
+
 /*
  * Initialize everything, well, just everything in Posix clocks/timers ;)
  */
@@ -241,6 +337,11 @@ static __init int init_posix_timers(void)
 	register_posix_clock(CLOCK_REALTIME, &clock_realtime);
 	register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic);
 
+	if (posix_kevent_init()) {
+		printk(KERN_ERR "Failed to initialize kevent posix timers.\n");
+		BUG();
+	}
+
 	posix_timers_cache = kmem_cache_create("posix_timers_cache",
 					sizeof (struct k_itimer), 0, 0, NULL, NULL);
 	idr_init(&posix_timers_id);
@@ -343,23 +444,29 @@ static int posix_timer_fn(struct hrtimer *timer)
 
 	timr = container_of(timer, struct k_itimer, it.real.timer);
 	spin_lock_irqsave(&timr->it_lock, flags);
+	
+	if (timr->it_sigev_notify == SIGEV_KEVENT) {
+#ifdef CONFIG_KEVENT_TIMER
+		kevent_storage_ready(&timr->st, NULL, KEVENT_MASK_ALL);
+#endif
+	} else {
+		if (timr->it.real.interval.tv64 != 0)
+			si_private = ++timr->it_requeue_pending;
 
-	if (timr->it.real.interval.tv64 != 0)
-		si_private = ++timr->it_requeue_pending;
-
-	if (posix_timer_event(timr, si_private)) {
-		/*
-		 * signal was not sent because of sig_ignor
-		 * we will not get a call back to restart it AND
-		 * it should be restarted.
-		 */
-		if (timr->it.real.interval.tv64 != 0) {
-			timr->it_overrun +=
-				hrtimer_forward(timer,
-						timer->base->softirq_time,
-						timr->it.real.interval);
-			ret = HRTIMER_RESTART;
-			++timr->it_requeue_pending;
+		if (posix_timer_event(timr, si_private)) {
+			/*
+			 * signal was not sent because of sig_ignor
+			 * we will not get a call back to restart it AND
+			 * it should be restarted.
+			 */
+			if (timr->it.real.interval.tv64 != 0) {
+				timr->it_overrun +=
+					hrtimer_forward(timer,
+							timer->base->softirq_time,
+							timr->it.real.interval);
+				ret = HRTIMER_RESTART;
+				++timr->it_requeue_pending;
+			}
 		}
 	}
 
@@ -407,6 +514,9 @@ static struct k_itimer * alloc_posix_timer(void)
 		kmem_cache_free(posix_timers_cache, tmr);
 		tmr = NULL;
 	}
+#ifdef CONFIG_KEVENT_TIMER
+	kevent_storage_init(tmr, &tmr->st);
+#endif
 	return tmr;
 }
 
@@ -424,6 +534,7 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
 	if (unlikely(tmr->it_process) &&
 	    tmr->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
 		put_task_struct(tmr->it_process);
+	posix_kevent_fini_timer(tmr);
 	kmem_cache_free(posix_timers_cache, tmr);
 }
 
@@ -496,40 +607,52 @@ sys_timer_create(const clockid_t which_clock,
 		new_timer->it_sigev_signo = event.sigev_signo;
 		new_timer->it_sigev_value = event.sigev_value;
 
-		read_lock(&tasklist_lock);
-		if ((process = good_sigevent(&event))) {
-			/*
-			 * We may be setting up this process for another
-			 * thread.  It may be exiting.  To catch this
-			 * case the we check the PF_EXITING flag.  If
-			 * the flag is not set, the siglock will catch
-			 * him before it is too late (in exit_itimers).
-			 *
-			 * The exec case is a bit more invloved but easy
-			 * to code.  If the process is in our thread
-			 * group (and it must be or we would not allow
-			 * it here) and is doing an exec, it will cause
-			 * us to be killed.  In this case it will wait
-			 * for us to die which means we can finish this
-			 * linkage with our last gasp. I.e. no code :)
-			 */
+		if (event.sigev_notify == SIGEV_KEVENT) {
+			error = posix_kevent_init_timer(new_timer, event._sigev_un.kevent_fd);
+			if (error)
+				goto out;
+
+			process = current->group_leader;
 			spin_lock_irqsave(&process->sighand->siglock, flags);
-			if (!(process->flags & PF_EXITING)) {
-				new_timer->it_process = process;
-				list_add(&new_timer->list,
-					 &process->signal->posix_timers);
-				spin_unlock_irqrestore(&process->sighand->siglock, flags);
-				if (new_timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
-					get_task_struct(process);
-			} else {
-				spin_unlock_irqrestore(&process->sighand->siglock, flags);
-				process = NULL;
+			new_timer->it_process = process;
+			list_add(&new_timer->list, &process->signal->posix_timers);
+			spin_unlock_irqrestore(&process->sighand->siglock, flags);
+		} else {
+			read_lock(&tasklist_lock);
+			if ((process = good_sigevent(&event))) {
+				/*
+				 * We may be setting up this process for another
+				 * thread.  It may be exiting.  To catch this
+				 * case the we check the PF_EXITING flag.  If
+				 * the flag is not set, the siglock will catch
+				 * him before it is too late (in exit_itimers).
+				 *
+				 * The exec case is a bit more invloved but easy
+				 * to code.  If the process is in our thread
+				 * group (and it must be or we would not allow
+				 * it here) and is doing an exec, it will cause
+				 * us to be killed.  In this case it will wait
+				 * for us to die which means we can finish this
+				 * linkage with our last gasp. I.e. no code :)
+				 */
+				spin_lock_irqsave(&process->sighand->siglock, flags);
+				if (!(process->flags & PF_EXITING)) {
+					new_timer->it_process = process;
+					list_add(&new_timer->list,
+						 &process->signal->posix_timers);
+					spin_unlock_irqrestore(&process->sighand->siglock, flags);
+					if (new_timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
+						get_task_struct(process);
+				} else {
+					spin_unlock_irqrestore(&process->sighand->siglock, flags);
+					process = NULL;
+				}
+			}
+			read_unlock(&tasklist_lock);
+			if (!process) {
+				error = -EINVAL;
+				goto out;
 			}
-		}
-		read_unlock(&tasklist_lock);
-		if (!process) {
-			error = -EINVAL;
-			goto out;
 		}
 	} else {
 		new_timer->it_sigev_notify = SIGEV_SIGNAL;


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [take29 7/8] kevent: Signal notifications.
  2006-12-23 16:51             ` [take29 6/8] kevent: Pipe notifications Evgeniy Polyakov
@ 2006-12-23 16:51               ` Evgeniy Polyakov
  2006-12-23 16:51                 ` [take29 8/8] kevent: Kevent posix timer notifications Evgeniy Polyakov
  0 siblings, 1 reply; 76+ messages in thread
From: Evgeniy Polyakov @ 2006-12-23 16:51 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: David Miller, Ulrich Drepper, Andrew Morton, Evgeniy Polyakov,
	netdev, Zach Brown, Christoph Hellwig, Chase Venters,
	Johann Borck, linux-kernel, Jeff Garzik, Jamal Hadi Salim


Signal notifications.

This type of notifications allows to deliver signals through kevent queue.
One can find example application signal.c on project homepage.

If KEVENT_SIGNAL_NOMASK bit is set in raw_u64 id then signal will be
delivered only through queue, otherwise both delivery types are used - old
through update of mask of pending signals and through queue.

If signal is delivered only through kevent queue mask of pending signals
is not updated at all, which is equal to putting signal into blocked mask,
but with delivery of that signal through kevent queue.

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>


diff --git a/include/linux/sched.h b/include/linux/sched.h
index fc4a987..ef38a3c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -80,6 +80,7 @@ struct sched_param {
 #include <linux/resource.h>
 #include <linux/timer.h>
 #include <linux/hrtimer.h>
+#include <linux/kevent_storage.h>
 
 #include <asm/processor.h>
 
@@ -1013,6 +1014,10 @@ struct task_struct {
 #ifdef	CONFIG_TASK_DELAY_ACCT
 	struct task_delay_info *delays;
 #endif
+#ifdef CONFIG_KEVENT_SIGNAL
+	struct kevent_storage st;
+	u32 kevent_signals;
+#endif
 };
 
 static inline pid_t process_group(struct task_struct *tsk)
diff --git a/kernel/fork.c b/kernel/fork.c
index 1c999f3..e5b5b14 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -46,6 +46,7 @@
 #include <linux/delayacct.h>
 #include <linux/taskstats_kern.h>
 #include <linux/random.h>
+#include <linux/kevent.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -115,6 +116,9 @@ void __put_task_struct(struct task_struct *tsk)
 	WARN_ON(atomic_read(&tsk->usage));
 	WARN_ON(tsk == current);
 
+#ifdef CONFIG_KEVENT_SIGNAL
+	kevent_storage_fini(&tsk->st);
+#endif
 	security_task_free(tsk);
 	free_uid(tsk->user);
 	put_group_info(tsk->group_info);
@@ -1121,6 +1125,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	if (retval)
 		goto bad_fork_cleanup_namespace;
 
+#ifdef CONFIG_KEVENT_SIGNAL
+	kevent_storage_init(p, &p->st);
+#endif
+
 	p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
 	/*
 	 * Clear TID on mm_release()?
diff --git a/kernel/kevent/kevent_signal.c b/kernel/kevent/kevent_signal.c
new file mode 100644
index 0000000..abe3972
--- /dev/null
+++ b/kernel/kevent/kevent_signal.c
@@ -0,0 +1,94 @@
+/*
+ * 	kevent_signal.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/kevent.h>
+
+static int kevent_signal_callback(struct kevent *k)
+{
+	struct task_struct *tsk = k->st->origin;
+	int sig = k->event.id.raw[0];
+	int ret = 0;
+
+	if (sig == tsk->kevent_signals)
+		ret = 1;
+
+	if (ret && (k->event.id.raw_u64 & KEVENT_SIGNAL_NOMASK))
+		tsk->kevent_signals |= 0x80000000;
+
+	return ret;
+}
+
+int kevent_signal_enqueue(struct kevent *k)
+{
+	int err;
+
+	err = kevent_storage_enqueue(&current->st, k);
+	if (err)
+		goto err_out_exit;
+
+	if (k->event.req_flags & KEVENT_REQ_ALWAYS_QUEUE) {
+		kevent_requeue(k);
+		err = 0;
+	} else {
+		err = k->callbacks.callback(k);
+		if (err)
+			goto err_out_dequeue;
+	}
+
+	return err;
+
+err_out_dequeue:
+	kevent_storage_dequeue(k->st, k);
+err_out_exit:
+	return err;
+}
+
+int kevent_signal_dequeue(struct kevent *k)
+{
+	kevent_storage_dequeue(k->st, k);
+	return 0;
+}
+
+int kevent_signal_notify(struct task_struct *tsk, int sig)
+{
+	tsk->kevent_signals = sig;
+	kevent_storage_ready(&tsk->st, NULL, KEVENT_SIGNAL_DELIVERY);
+	return (tsk->kevent_signals & 0x80000000);
+}
+
+static int __init kevent_init_signal(void)
+{
+	struct kevent_callbacks sc = {
+		.callback = &kevent_signal_callback,
+		.enqueue = &kevent_signal_enqueue,
+		.dequeue = &kevent_signal_dequeue,
+		.flags = 0,
+	};
+
+	return kevent_add_callbacks(&sc, KEVENT_SIGNAL);
+}
+module_init(kevent_init_signal);
diff --git a/kernel/signal.c b/kernel/signal.c
index fb5da6d..d3d3594 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -23,6 +23,7 @@
 #include <linux/ptrace.h>
 #include <linux/signal.h>
 #include <linux/capability.h>
+#include <linux/kevent.h>
 #include <asm/param.h>
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -703,6 +704,9 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
 {
 	struct sigqueue * q = NULL;
 	int ret = 0;
+	
+	if (kevent_signal_notify(t, sig))
+		return 1;
 
 	/*
 	 * fast-pathed signals for kernel-internal things like SIGSTOP
@@ -782,6 +786,17 @@ specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t)
 	ret = send_signal(sig, info, t, &t->pending);
 	if (!ret && !sigismember(&t->blocked, sig))
 		signal_wake_up(t, sig == SIGKILL);
+#ifdef CONFIG_KEVENT_SIGNAL
+	/*
+	 * Kevent allows to deliver signals through kevent queue, 
+	 * it is possible to setup kevent to not deliver
+	 * signal through the usual way, in that case send_signal()
+	 * returns 1 and signal is delivered only through kevent queue.
+	 * We simulate successfull delivery notification through this hack:
+	 */
+	if (ret == 1)
+		ret = 0;
+#endif
 out:
 	return ret;
 }
@@ -971,6 +986,17 @@ __group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
 	 * to avoid several races.
 	 */
 	ret = send_signal(sig, info, p, &p->signal->shared_pending);
+#ifdef CONFIG_KEVENT_SIGNAL
+	/*
+	 * Kevent allows to deliver signals through kevent queue, 
+	 * it is possible to setup kevent to not deliver
+	 * signal through the usual way, in that case send_signal()
+	 * returns 1 and signal is delivered only through kevent queue.
+	 * We simulate successfull delivery notification through this hack:
+	 */
+	if (ret == 1)
+		ret = 0;
+#endif
 	if (unlikely(ret))
 		return ret;
 


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [take29 5/8] kevent: Timer notifications.
  2006-12-23 16:51         ` [take29 4/8] kevent: Socket notifications Evgeniy Polyakov
@ 2006-12-23 16:51           ` Evgeniy Polyakov
  2006-12-23 16:51             ` [take29 6/8] kevent: Pipe notifications Evgeniy Polyakov
  0 siblings, 1 reply; 76+ messages in thread
From: Evgeniy Polyakov @ 2006-12-23 16:51 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: David Miller, Ulrich Drepper, Andrew Morton, Evgeniy Polyakov,
	netdev, Zach Brown, Christoph Hellwig, Chase Venters,
	Johann Borck, linux-kernel, Jeff Garzik, Jamal Hadi Salim


Timer notifications.

Timer notifications can be used for fine grained per-process time 
management, since interval timers are very inconvenient to use, 
and they are limited.

This subsystem uses high-resolution timers.
id.raw[0] is used as number of seconds
id.raw[1] is used as number of nanoseconds

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>

diff --git a/kernel/kevent/kevent_timer.c b/kernel/kevent/kevent_timer.c
new file mode 100644
index 0000000..c21a155
--- /dev/null
+++ b/kernel/kevent/kevent_timer.c
@@ -0,0 +1,114 @@
+/*
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/hrtimer.h>
+#include <linux/jiffies.h>
+#include <linux/kevent.h>
+
+struct kevent_timer
+{
+	struct hrtimer		ktimer;
+	struct kevent_storage	ktimer_storage;
+	struct kevent		*ktimer_event;
+};
+
+static int kevent_timer_func(struct hrtimer *timer)
+{
+	struct kevent_timer *t = container_of(timer, struct kevent_timer, ktimer);
+	struct kevent *k = t->ktimer_event;
+
+	kevent_storage_ready(&t->ktimer_storage, NULL, KEVENT_MASK_ALL);
+	hrtimer_forward(timer, timer->base->softirq_time,
+			ktime_set(k->event.id.raw[0], k->event.id.raw[1]));
+	return HRTIMER_RESTART;
+}
+
+static struct lock_class_key kevent_timer_key;
+
+static int kevent_timer_enqueue(struct kevent *k)
+{
+	int err;
+	struct kevent_timer *t;
+
+	t = kmalloc(sizeof(struct kevent_timer), GFP_KERNEL);
+	if (!t)
+		return -ENOMEM;
+
+	hrtimer_init(&t->ktimer, CLOCK_MONOTONIC, HRTIMER_REL);
+	t->ktimer.expires = ktime_set(k->event.id.raw[0], k->event.id.raw[1]);
+	t->ktimer.function = kevent_timer_func;
+	t->ktimer_event = k;
+
+	err = kevent_storage_init(&t->ktimer, &t->ktimer_storage);
+	if (err)
+		goto err_out_free;
+	lockdep_set_class(&t->ktimer_storage.lock, &kevent_timer_key);
+
+	err = kevent_storage_enqueue(&t->ktimer_storage, k);
+	if (err)
+		goto err_out_st_fini;
+
+	hrtimer_start(&t->ktimer, t->ktimer.expires, HRTIMER_REL);
+
+	return 0;
+
+err_out_st_fini:
+	kevent_storage_fini(&t->ktimer_storage);
+err_out_free:
+	kfree(t);
+
+	return err;
+}
+
+static int kevent_timer_dequeue(struct kevent *k)
+{
+	struct kevent_storage *st = k->st;
+	struct kevent_timer *t = container_of(st, struct kevent_timer, ktimer_storage);
+
+	hrtimer_cancel(&t->ktimer);
+	kevent_storage_dequeue(st, k);
+	kfree(t);
+
+	return 0;
+}
+
+static int kevent_timer_callback(struct kevent *k)
+{
+	k->event.ret_data[0] = jiffies_to_msecs(jiffies);
+	return 1;
+}
+
+static int __init kevent_init_timer(void)
+{
+	struct kevent_callbacks tc = {
+		.callback = &kevent_timer_callback,
+		.enqueue = &kevent_timer_enqueue,
+		.dequeue = &kevent_timer_dequeue,
+		.flags = 0,
+	};
+
+	return kevent_add_callbacks(&tc, KEVENT_TIMER);
+}
+module_init(kevent_init_timer);
+


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [take29 6/8] kevent: Pipe notifications.
  2006-12-23 16:51           ` [take29 5/8] kevent: Timer notifications Evgeniy Polyakov
@ 2006-12-23 16:51             ` Evgeniy Polyakov
  2006-12-23 16:51               ` [take29 7/8] kevent: Signal notifications Evgeniy Polyakov
  0 siblings, 1 reply; 76+ messages in thread
From: Evgeniy Polyakov @ 2006-12-23 16:51 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: David Miller, Ulrich Drepper, Andrew Morton, Evgeniy Polyakov,
	netdev, Zach Brown, Christoph Hellwig, Chase Venters,
	Johann Borck, linux-kernel, Jeff Garzik, Jamal Hadi Salim


Pipe notifications.


diff --git a/fs/pipe.c b/fs/pipe.c
index f3b6f71..aeaee9c 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -16,6 +16,7 @@
 #include <linux/uio.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
+#include <linux/kevent.h>
 
 #include <asm/uaccess.h>
 #include <asm/ioctls.h>
@@ -312,6 +313,7 @@ redo:
 			break;
 		}
 		if (do_wakeup) {
+			kevent_pipe_notify(inode, KEVENT_SOCKET_SEND);
 			wake_up_interruptible_sync(&pipe->wait);
  			kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
 		}
@@ -321,6 +323,7 @@ redo:
 
 	/* Signal writers asynchronously that there is more room. */
 	if (do_wakeup) {
+		kevent_pipe_notify(inode, KEVENT_SOCKET_SEND);
 		wake_up_interruptible(&pipe->wait);
 		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
 	}
@@ -490,6 +493,7 @@ redo2:
 			break;
 		}
 		if (do_wakeup) {
+			kevent_pipe_notify(inode, KEVENT_SOCKET_RECV);
 			wake_up_interruptible_sync(&pipe->wait);
 			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 			do_wakeup = 0;
@@ -501,6 +505,7 @@ redo2:
 out:
 	mutex_unlock(&inode->i_mutex);
 	if (do_wakeup) {
+		kevent_pipe_notify(inode, KEVENT_SOCKET_RECV);
 		wake_up_interruptible(&pipe->wait);
 		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 	}
@@ -605,6 +610,7 @@ pipe_release(struct inode *inode, int decr, int decw)
 		free_pipe_info(inode);
 	} else {
 		wake_up_interruptible(&pipe->wait);
+		kevent_pipe_notify(inode, KEVENT_SOCKET_SEND|KEVENT_SOCKET_RECV);
 		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
 	}
diff --git a/kernel/kevent/kevent_pipe.c b/kernel/kevent/kevent_pipe.c
new file mode 100644
index 0000000..91dc1eb
--- /dev/null
+++ b/kernel/kevent/kevent_pipe.c
@@ -0,0 +1,123 @@
+/*
+ * 	kevent_pipe.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/kevent.h>
+#include <linux/pipe_fs_i.h>
+
+static int kevent_pipe_callback(struct kevent *k)
+{
+	struct inode *inode = k->st->origin;
+	struct pipe_inode_info *pipe = inode->i_pipe;
+	int nrbufs = pipe->nrbufs;
+
+	if (k->event.event & KEVENT_SOCKET_RECV && nrbufs > 0) {
+		if (!pipe->writers)
+			return -1;
+		return 1;
+	}
+	
+	if (k->event.event & KEVENT_SOCKET_SEND && nrbufs < PIPE_BUFFERS) {
+		if (!pipe->readers)
+			return -1;
+		return 1;
+	}
+
+	return 0;
+}
+
+int kevent_pipe_enqueue(struct kevent *k)
+{
+	struct file *pipe;
+	int err = -EBADF;
+	struct inode *inode;
+
+	pipe = fget(k->event.id.raw[0]);
+	if (!pipe)
+		goto err_out_exit;
+
+	inode = igrab(pipe->f_dentry->d_inode);
+	if (!inode)
+		goto err_out_fput;
+
+	err = -EINVAL;
+	if (!S_ISFIFO(inode->i_mode))
+		goto err_out_iput;
+
+	err = kevent_storage_enqueue(&inode->st, k);
+	if (err)
+		goto err_out_iput;
+
+	if (k->event.req_flags & KEVENT_REQ_ALWAYS_QUEUE) {
+		kevent_requeue(k);
+		err = 0;
+	} else {
+		err = k->callbacks.callback(k);
+		if (err)
+			goto err_out_dequeue;
+	}
+
+	fput(pipe);
+
+	return err;
+
+err_out_dequeue:
+	kevent_storage_dequeue(k->st, k);
+err_out_iput:
+	iput(inode);
+err_out_fput:
+	fput(pipe);
+err_out_exit:
+	return err;
+}
+
+int kevent_pipe_dequeue(struct kevent *k)
+{
+	struct inode *inode = k->st->origin;
+
+	kevent_storage_dequeue(k->st, k);
+	iput(inode);
+
+	return 0;
+}
+
+void kevent_pipe_notify(struct inode *inode, u32 event)
+{
+	kevent_storage_ready(&inode->st, NULL, event);
+}
+
+static int __init kevent_init_pipe(void)
+{
+	struct kevent_callbacks sc = {
+		.callback = &kevent_pipe_callback,
+		.enqueue = &kevent_pipe_enqueue,
+		.dequeue = &kevent_pipe_dequeue,
+		.flags = 0,
+	};
+
+	return kevent_add_callbacks(&sc, KEVENT_PIPE);
+}
+module_init(kevent_init_pipe);


^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [take29 0/8] kevent: Generic event handling mechanism.
  2006-12-23 16:51 ` [take29 " Evgeniy Polyakov
  2006-12-23 16:51   ` [take29 1/8] kevent: Description Evgeniy Polyakov
@ 2006-12-23 17:10   ` Evgeniy Polyakov
  2006-12-28 15:56   ` Ingo Molnar
  2006-12-28 16:01   ` Ingo Molnar
  3 siblings, 0 replies; 76+ messages in thread
From: Evgeniy Polyakov @ 2006-12-23 17:10 UTC (permalink / raw)
  To: Jamal Hadi Salim
  Cc: David Miller, Ulrich Drepper, Andrew Morton, netdev, Zach Brown,
	Christoph Hellwig, Chase Venters, Johann Borck, linux-kernel,
	Jeff Garzik

On Sat, Dec 23, 2006 at 07:51:40PM +0300, Evgeniy Polyakov (johnpol@2ka.mipt.ru) wrote:
> 
> Generic event handling mechanism.
> 
> Kevent is a generic subsytem which allows to handle event notifications.
> It supports both level and edge triggered events. It is similar to
> poll/epoll in some cases, but it is more scalable, it is faster and
> allows to work with essentially eny kind of events.
> 
> Events are provided into kernel through control syscall and can be read
> back through ring buffer or using usual syscalls.
> Kevent update (i.e. readiness switching) happens directly from internals
> of the appropriate state machine of the underlying subsytem (like
> network, filesystem, timer or any other).
> 
> Homepage:
> http://tservice.net.ru/~s0mbre/old/?section=projects&item=kevent
> 
> Documentation page:
> http://linux-net.osdl.org/index.php/Kevent
> 
> Consider for inclusion.
> 
> New benchmark, which can be a hoax though, can be found at 
> http://tservice.net.ru/~s0mbre/blog/2006/11/30#2006_11_30
> where kevent on amd64 with 1gb of ram can handle more than 7200 events per 
> second with 8000 requests concurrency with 'ab' benchmark and lighttpd.
> Although I tought it should not be published due to possible errors,
> I decided to send it for review.
> 
> With this release I start 3 days resending timeout - i.e. each third day I 
> will send either new version (if something new was requested and agreed to 
> be implemented) or resending with back counter started from three. 
> When back counter hits zero after three resending I consider there is no 
> interest in subsystem and I will stop further sending.
> First one will be sent about Jan 10.
> 
> Thanks for understanding and your time.
> 
> Changes from 'take28' patchset:
>  * optimized af_unix to use socket notifications
>  * changed ALWAYS_QUEUE behaviour with poll/select notifications - previously
>  	kevent was not queued into poll wait queue when ALWAYS_QUEUE flag
> 	is set
>  * added KEVENT_POLL_POLLRDHUP definition into ukevent.h header
>  * libevent-1.2 patch (Jamal, your request is completed, so I'm waiting two weeks
>  	before starting final countdown :)
> 	All regression tests passed successfully except test_evbuffer(), which
> 	crashes on my amd64 linux 2.6 test machine for all types of notifications,
> 	probably it is fixed in libevent-1.2a version, I did not check.
> 	Patch and README can be found at project homepage.

P.S. all kernel kevent options must be turned on (namely kevent_poll,
kevent_socket and kevent_pipe).
I did not hack 'configure' to check for supported notification types.

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [take29 0/8] kevent: Generic event handling mechanism.
  2006-12-23 16:51 ` [take29 " Evgeniy Polyakov
  2006-12-23 16:51   ` [take29 1/8] kevent: Description Evgeniy Polyakov
  2006-12-23 17:10   ` [take29 0/8] kevent: Generic event handling mechanism Evgeniy Polyakov
@ 2006-12-28 15:56   ` Ingo Molnar
  2006-12-29  8:48     ` Evgeniy Polyakov
  2006-12-28 16:01   ` Ingo Molnar
  3 siblings, 1 reply; 76+ messages in thread
From: Ingo Molnar @ 2006-12-28 15:56 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: David Miller, Ulrich Drepper, Andrew Morton, netdev, Zach Brown,
	Christoph Hellwig, Chase Venters, Johann Borck, linux-kernel,
	Jeff Garzik, Jamal Hadi Salim


* Evgeniy Polyakov <johnpol@2ka.mipt.ru> wrote:

> Generic event handling mechanism.

it would be /very/ helpful to state against which kernel tree the 
patch-queue is. It does not apply to 2.6.20-rc1 nor to -rc2 nor to 
2.6.19. At which point i gave up ...

	Ingo

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [take29 0/8] kevent: Generic event handling mechanism.
  2006-12-23 16:51 ` [take29 " Evgeniy Polyakov
                     ` (2 preceding siblings ...)
  2006-12-28 15:56   ` Ingo Molnar
@ 2006-12-28 16:01   ` Ingo Molnar
  2006-12-29  8:55     ` Evgeniy Polyakov
  3 siblings, 1 reply; 76+ messages in thread
From: Ingo Molnar @ 2006-12-28 16:01 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: David Miller, Ulrich Drepper, Andrew Morton, netdev, Zach Brown,
	Christoph Hellwig, Chase Venters, Johann Borck, linux-kernel,
	Jeff Garzik, Jamal Hadi Salim


* Evgeniy Polyakov <johnpol@2ka.mipt.ru> wrote:

> Generic event handling mechanism.

i see it covers alot of event sources, but i cannot see block IO 
notifications. Am i missing something?

	Ingo

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [take29 0/8] kevent: Generic event handling mechanism.
  2006-12-28 15:56   ` Ingo Molnar
@ 2006-12-29  8:48     ` Evgeniy Polyakov
  0 siblings, 0 replies; 76+ messages in thread
From: Evgeniy Polyakov @ 2006-12-29  8:48 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: David Miller, Ulrich Drepper, Andrew Morton, netdev, Zach Brown,
	Christoph Hellwig, Chase Venters, Johann Borck, linux-kernel,
	Jeff Garzik, Jamal Hadi Salim

On Thu, Dec 28, 2006 at 04:56:45PM +0100, Ingo Molnar (mingo@elte.hu) wrote:
> 
> * Evgeniy Polyakov <johnpol@2ka.mipt.ru> wrote:
> 
> > Generic event handling mechanism.
> 
> it would be /very/ helpful to state against which kernel tree the 
> patch-queue is. It does not apply to 2.6.20-rc1 nor to -rc2 nor to 
> 2.6.19. At which point i gave up ...

It was against 2.6.18 (d4397acde6fd047f13c744e5471a9bfe287f78a3) git.
Next patchset with possibility to inject already read event and
userspace reserved notifications will be against the latest tree (I will
push it today before final New Year celebration started).

> 	Ingo

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [take29 0/8] kevent: Generic event handling mechanism.
  2006-12-28 16:01   ` Ingo Molnar
@ 2006-12-29  8:55     ` Evgeniy Polyakov
  2006-12-29 12:54       ` Ingo Molnar
  0 siblings, 1 reply; 76+ messages in thread
From: Evgeniy Polyakov @ 2006-12-29  8:55 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: David Miller, Ulrich Drepper, Andrew Morton, netdev, Zach Brown,
	Christoph Hellwig, Chase Venters, Johann Borck, linux-kernel,
	Jeff Garzik, Jamal Hadi Salim

On Thu, Dec 28, 2006 at 05:01:37PM +0100, Ingo Molnar (mingo@elte.hu) wrote:
> 
> * Evgeniy Polyakov <johnpol@2ka.mipt.ru> wrote:
> 
> > Generic event handling mechanism.
> 
> i see it covers alot of event sources, but i cannot see block IO 
> notifications. Am i missing something?

Depending on what it is :)
If you mean kevent based AIO, then it was dropped to reduce size of the
patchset, and in favour of new AIO design.
Other kinds of read/write notifications can be handled by poll/select
notifications.

> 	Ingo

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 76+ messages in thread

* [take30 3/9] kevent: poll/select() notifications.
  2006-12-29 12:25     ` [take30 2/9] kevent: Core files Evgeniy Polyakov
@ 2006-12-29 12:25       ` Evgeniy Polyakov
  2006-12-29 12:25         ` [take30 4/9] kevent: Socket notifications Evgeniy Polyakov
  0 siblings, 1 reply; 76+ messages in thread
From: Evgeniy Polyakov @ 2006-12-29 12:25 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: David Miller, Ulrich Drepper, Andrew Morton, Evgeniy Polyakov,
	netdev, Zach Brown, Christoph Hellwig, Chase Venters,
	Johann Borck, linux-kernel, Jeff Garzik, Jamal Hadi Salim,
	Ingo Molnar


poll/select() notifications.

This patch includes generic poll/select notifications.
kevent_poll works simialr to epoll and has the same issues (callback
is invoked not from internal state machine of the caller, but through
process awake, a lot of allocations and so on).

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mitp.ru>

diff --git a/fs/file_table.c b/fs/file_table.c
index 4c17a18..46f458c 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -20,6 +20,7 @@
 #include <linux/cdev.h>
 #include <linux/fsnotify.h>
 #include <linux/sysctl.h>
+#include <linux/kevent.h>
 #include <linux/percpu_counter.h>
 
 #include <asm/atomic.h>
@@ -119,6 +120,7 @@ struct file *get_empty_filp(void)
 	f->f_uid = tsk->fsuid;
 	f->f_gid = tsk->fsgid;
 	eventpoll_init_file(f);
+	kevent_init_file(f);
 	/* f->f_version: 0 */
 	return f;
 
@@ -164,6 +166,7 @@ void fastcall __fput(struct file *file)
 	 * in the file cleanup chain.
 	 */
 	eventpoll_release(file);
+	kevent_cleanup_file(file);
 	locks_remove_flock(file);
 
 	if (file->f_op && file->f_op->release)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 186da81..62ef137 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -280,6 +280,7 @@ extern int dir_notify_enable;
 #include <linux/init.h>
 #include <linux/pid.h>
 #include <linux/mutex.h>
+#include <linux/kevent_storage.h>
 
 #include <asm/atomic.h>
 #include <asm/semaphore.h>
@@ -578,6 +579,10 @@ struct inode {
 	struct mutex		inotify_mutex;	/* protects the watches list */
 #endif
 
+#if defined CONFIG_KEVENT_SOCKET || defined CONFIG_KEVENT_PIPE
+	struct kevent_storage	st;
+#endif
+
 	unsigned long		i_state;
 	unsigned long		dirtied_when;	/* jiffies of first dirtying */
 
@@ -737,6 +742,9 @@ struct file {
 	struct list_head	f_ep_links;
 	spinlock_t		f_ep_lock;
 #endif /* #ifdef CONFIG_EPOLL */
+#ifdef CONFIG_KEVENT_POLL
+	struct kevent_storage	st;
+#endif
 	struct address_space	*f_mapping;
 };
 extern spinlock_t files_lock;
diff --git a/kernel/kevent/kevent_poll.c b/kernel/kevent/kevent_poll.c
new file mode 100644
index 0000000..58129fa
--- /dev/null
+++ b/kernel/kevent/kevent_poll.c
@@ -0,0 +1,234 @@
+/*
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/timer.h>
+#include <linux/file.h>
+#include <linux/kevent.h>
+#include <linux/poll.h>
+#include <linux/fs.h>
+
+static struct kmem_cache *kevent_poll_container_cache;
+static struct kmem_cache *kevent_poll_priv_cache;
+
+struct kevent_poll_ctl
+{
+	struct poll_table_struct 	pt;
+	struct kevent			*k;
+};
+
+struct kevent_poll_wait_container
+{
+	struct list_head		container_entry;
+	wait_queue_head_t		*whead;
+	wait_queue_t			wait;
+	struct kevent			*k;
+};
+
+struct kevent_poll_private
+{
+	struct list_head		container_list;
+	spinlock_t			container_lock;
+};
+
+static int kevent_poll_enqueue(struct kevent *k);
+static int kevent_poll_dequeue(struct kevent *k);
+static int kevent_poll_callback(struct kevent *k);
+
+static int kevent_poll_wait_callback(wait_queue_t *wait,
+		unsigned mode, int sync, void *key)
+{
+	struct kevent_poll_wait_container *cont =
+		container_of(wait, struct kevent_poll_wait_container, wait);
+	struct kevent *k = cont->k;
+
+	kevent_storage_ready(k->st, NULL, KEVENT_MASK_ALL);
+	return 0;
+}
+
+static void kevent_poll_qproc(struct file *file, wait_queue_head_t *whead,
+		struct poll_table_struct *poll_table)
+{
+	struct kevent *k =
+		container_of(poll_table, struct kevent_poll_ctl, pt)->k;
+	struct kevent_poll_private *priv = k->priv;
+	struct kevent_poll_wait_container *cont;
+	unsigned long flags;
+
+	cont = kmem_cache_alloc(kevent_poll_container_cache, GFP_KERNEL);
+	if (!cont) {
+		kevent_break(k);
+		return;
+	}
+
+	cont->k = k;
+	init_waitqueue_func_entry(&cont->wait, kevent_poll_wait_callback);
+	cont->whead = whead;
+
+	spin_lock_irqsave(&priv->container_lock, flags);
+	list_add_tail(&cont->container_entry, &priv->container_list);
+	spin_unlock_irqrestore(&priv->container_lock, flags);
+
+	add_wait_queue(whead, &cont->wait);
+}
+
+static int kevent_poll_enqueue(struct kevent *k)
+{
+	struct file *file;
+	int err;
+	unsigned int revents;
+	unsigned long flags;
+	struct kevent_poll_ctl ctl;
+	struct kevent_poll_private *priv;
+
+	file = fget(k->event.id.raw[0]);
+	if (!file)
+		return -EBADF;
+	
+	err = -EINVAL;
+	if (!file->f_op || !file->f_op->poll)
+		goto err_out_fput;
+	
+	err = -ENOMEM;
+	priv = kmem_cache_alloc(kevent_poll_priv_cache, GFP_KERNEL);
+	if (!priv)
+		goto err_out_fput;
+
+	spin_lock_init(&priv->container_lock);
+	INIT_LIST_HEAD(&priv->container_list);
+
+	k->priv = priv;
+
+	ctl.k = k;
+	init_poll_funcptr(&ctl.pt, &kevent_poll_qproc);
+	
+	err = kevent_storage_enqueue(&file->st, k);
+	if (err)
+		goto err_out_free;
+
+	revents = file->f_op->poll(file, &ctl.pt);
+	if (k->event.req_flags & KEVENT_REQ_ALWAYS_QUEUE) {
+		kevent_requeue(k);
+	} else {
+		if (revents & k->event.event) {
+			err = 1;
+			goto out_dequeue;
+		}
+	}
+
+	spin_lock_irqsave(&k->ulock, flags);
+	k->event.req_flags |= KEVENT_REQ_LAST_CHECK;
+	spin_unlock_irqrestore(&k->ulock, flags);
+
+	return 0;
+
+out_dequeue:
+	kevent_storage_dequeue(k->st, k);
+err_out_free:
+	kmem_cache_free(kevent_poll_priv_cache, priv);
+err_out_fput:
+	fput(file);
+	return err;
+}
+
+static int kevent_poll_dequeue(struct kevent *k)
+{
+	struct file *file = k->st->origin;
+	struct kevent_poll_private *priv = k->priv;
+	struct kevent_poll_wait_container *w, *n;
+	unsigned long flags;
+
+	kevent_storage_dequeue(k->st, k);
+
+	spin_lock_irqsave(&priv->container_lock, flags);
+	list_for_each_entry_safe(w, n, &priv->container_list, container_entry) {
+		list_del(&w->container_entry);
+		remove_wait_queue(w->whead, &w->wait);
+		kmem_cache_free(kevent_poll_container_cache, w);
+	}
+	spin_unlock_irqrestore(&priv->container_lock, flags);
+
+	kmem_cache_free(kevent_poll_priv_cache, priv);
+	k->priv = NULL;
+
+	fput(file);
+
+	return 0;
+}
+
+static int kevent_poll_callback(struct kevent *k)
+{
+	if (k->event.req_flags & KEVENT_REQ_LAST_CHECK) {
+		return 1;
+	} else {
+		struct file *file = k->st->origin;
+		unsigned int revents = file->f_op->poll(file, NULL);
+
+		k->event.ret_data[0] = revents & k->event.event;
+
+		return (revents & k->event.event);
+	}
+}
+
+static int __init kevent_poll_sys_init(void)
+{
+	struct kevent_callbacks pc = {
+		.callback = &kevent_poll_callback,
+		.enqueue = &kevent_poll_enqueue,
+		.dequeue = &kevent_poll_dequeue,
+		.flags = 0,
+	};
+
+	kevent_poll_container_cache = kmem_cache_create("kevent_poll_container_cache",
+			sizeof(struct kevent_poll_wait_container), 0, 0, NULL, NULL);
+	if (!kevent_poll_container_cache) {
+		printk(KERN_ERR "Failed to create kevent poll container cache.\n");
+		return -ENOMEM;
+	}
+
+	kevent_poll_priv_cache = kmem_cache_create("kevent_poll_priv_cache",
+			sizeof(struct kevent_poll_private), 0, 0, NULL, NULL);
+	if (!kevent_poll_priv_cache) {
+		printk(KERN_ERR "Failed to create kevent poll private data cache.\n");
+		kmem_cache_destroy(kevent_poll_container_cache);
+		kevent_poll_container_cache = NULL;
+		return -ENOMEM;
+	}
+
+	kevent_add_callbacks(&pc, KEVENT_POLL);
+
+	printk(KERN_INFO "Kevent poll()/select() subsystem has been initialized.\n");
+	return 0;
+}
+
+static struct lock_class_key kevent_poll_key;
+
+void kevent_poll_reinit(struct file *file)
+{
+	lockdep_set_class(&file->st.lock, &kevent_poll_key);
+}
+
+static void __exit kevent_poll_sys_fini(void)
+{
+	kmem_cache_destroy(kevent_poll_priv_cache);
+	kmem_cache_destroy(kevent_poll_container_cache);
+}
+
+module_init(kevent_poll_sys_init);
+module_exit(kevent_poll_sys_fini);


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [take30 0/9] kevent: Generic event handling mechanism.
       [not found] <3154985aa0591036@2ka.mipt.ru>
                   ` (2 preceding siblings ...)
  2006-12-23 16:51 ` [take29 " Evgeniy Polyakov
@ 2006-12-29 12:25 ` Evgeniy Polyakov
  2006-12-29 12:25   ` [take30 1/9] kevent: Description Evgeniy Polyakov
  2007-01-08 19:25 ` [take31 0/10] kevent: Generic event handling mechanism Evgeniy Polyakov
  4 siblings, 1 reply; 76+ messages in thread
From: Evgeniy Polyakov @ 2006-12-29 12:25 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: David Miller, Ulrich Drepper, Andrew Morton, Evgeniy Polyakov,
	netdev, Zach Brown, Christoph Hellwig, Chase Venters,
	Johann Borck, linux-kernel, Jeff Garzik, Jamal Hadi Salim,
	Ingo Molnar


Generic event handling mechanism.

Kevent is a generic subsytem which allows to handle event notifications.
It supports both level and edge triggered events. It is similar to
poll/epoll in some cases, but it is more scalable, it is faster and
allows to work with essentially eny kind of events.

Events are provided into kernel through control syscall and can be read
back through ring buffer or using usual syscalls.
Kevent update (i.e. readiness switching) happens directly from internals
of the appropriate state machine of the underlying subsytem (like
network, filesystem, timer or any other).

Homepage:
http://tservice.net.ru/~s0mbre/old/?section=projects&item=kevent

Documentation page:
http://linux-net.osdl.org/index.php/Kevent

Consider for inclusion.

Changes from 'take29' patchset:
 * new private userspace notifications - allows to queue any userspace private 
 	event and then mark it as ready using kevent_ctl(KEVENT_READY) command
 * KEVENT_REQ_READY flag - if set kevent will be marked as ready at enqueue time
 * port to 2.6.20-rc2 tree (54abb5fcdae74a811ed440ec6556cabc6b24f404 commit)
 * use struct kmem_cache instead of kmem_cache_t
 * added notificaion type into search key, this allows to have the same id for 
 	different types of notifications

Changes from 'take28' patchset:
 * optimized af_unix to use socket notifications
 * changed ALWAYS_QUEUE behaviour with poll/select notifications - previously
 	kevent was not queued into poll wait queue when ALWAYS_QUEUE flag
	is set
 * added KEVENT_POLL_POLLRDHUP definition into ukevent.h header
 * libevent-1.2 patch (Jamal, your request is completed, so I'm waiting two weeks
 	before starting final countdown :)
	All regression tests passed successfully except test_evbuffer(), which is
	crashed on my amd64 linux 2.6 test machine for all types of notifications,
	probably it was fixed in libevent-1.2a version, I did not check.
	Patch and README can be found at project homepage.

Changes from 'take27' patchset:
 * made kevent default yes in non embedded case.
 * added falgs to callback structures - currently used to check if kevent
 	can be requested from kernelspace only (posix timers) or 
	userspace (all others)

Changes from 'take26' patchset:
 * made kevent visible in config only in case of embedded setup.
 * added comment about KEVENT_MAX number.
 * spell fix.

Changes from 'take25' patchset:
 * use timespec as timeout parameter.
 * added high-resolution timer to handle absolute timeouts.
 * added flags to waiting and initialization syscalls.
 * kevent_commit() has new_uidx parameter.
 * kevent_wait() has old_uidx parameter, which, if not equal to u->uidx,
 	results in immediate wakeup (usefull for the case when entries
	are added asynchronously from kernel (not supported for now)).
 * added interface to mark any event as ready.
 * event POSIX timers support.
 * return -ENOSYS if there is no registered event type.
 * provided file descriptor must be checked for fifo type (spotted by Eric Dumazet).
 * signal notifications.
 * documentation update.
 * lighttpd patch updated (the latest benchmarks with lighttpd patch can be found in blog).

Changes from 'take24' patchset:
 * new (old (new)) ring buffer implementation with kernel and user indexes.
 * added initialization syscall instead of opening /dev/kevent
 * kevent_commit() syscall to commit ring buffer entries
 * changed KEVENT_REQ_WAKEUP_ONE flag to KEVENT_REQ_WAKEUP_ALL, kevent wakes
   only first thread always if that flag is not set
 * KEVENT_REQ_ALWAYS_QUEUE flag. If set, kevent will be queued into ready queue
   instead of copying back to userspace when kevent is ready immediately when
   it is added.
 * lighttpd patch (Hail! Although nothing really outstanding compared to epoll)

Changes from 'take23' patchset:
 * kevent PIPE notifications
 * KEVENT_REQ_LAST_CHECK flag, which allows to perform last check at dequeueing time
 * fixed poll/select notifications (were broken due to tree manipulations)
 * made Documentation/kevent.txt look nice in 80-col terminal
 * fix for copy_to_user() failure report for the first kevent (Andrew Morton)
 * minor function renames

Changes from 'take22' patchset:
 * new ring buffer implementation in process' memory
 * wakeup-one-thread flag
 * edge-triggered behaviour

Changes from 'take21' patchset:
 * minor cleanups (different return values, removed unneded variables, whitespaces and so on)
 * fixed bug in kevent removal in case when kevent being removed
   is the same as overflow_kevent (spotted by Eric Dumazet)

Changes from 'take20' patchset:
 * new ring buffer implementation
 * removed artificial limit on possible number of kevents

Changes from 'take19' patchset:
 * use __init instead of __devinit
 * removed 'default N' from config for user statistic
 * removed kevent_user_fini() since kevent can not be unloaded
 * use KERN_INFO for statistic output

Changes from 'take18' patchset:
 * use __init instead of __devinit
 * removed 'default N' from config for user statistic
 * removed kevent_user_fini() since kevent can not be unloaded
 * use KERN_INFO for statistic output

Changes from 'take17' patchset:
 * Use RB tree instead of hash table. 
	At least for a web sever, frequency of addition/deletion of new kevent 
	is comparable with number of search access, i.e. most of the time events 
	are added, accesed only couple of times and then removed, so it justifies 
	RB tree usage over AVL tree, since the latter does have much slower deletion 
	time (max O(log(N)) compared to 3 ops), 
	although faster search time (1.44*O(log(N)) vs. 2*O(log(N))). 
	So for kevents I use RB tree for now and later, when my AVL tree implementation 
	is ready, it will be possible to compare them.
 * Changed readiness check for socket notifications.

With both above changes it is possible to achieve more than 3380 req/second compared to 2200, 
sometimes 2500 req/second for epoll() for trivial web-server and httperf client on the same
hardware.
It is possible that above kevent limit is due to maximum allowed kevents in a time limit, which is
4096 events.

Changes from 'take16' patchset:
 * misc cleanups (__read_mostly, const ...)
 * created special macro which is used for mmap size (number of pages) calculation
 * export kevent_socket_notify(), since it is used in network protocols which can be 
	built as modules (IPv6 for example)

Changes from 'take15' patchset:
 * converted kevent_timer to high-resolution timers, this forces timer API update at
	http://linux-net.osdl.org/index.php/Kevent
 * use struct ukevent* instead of void * in syscalls (documentation has been updated)
 * added warning in kevent_add_ukevent() if ring has broken index (for testing)

Changes from 'take14' patchset:
 * added kevent_wait()
    This syscall waits until either timeout expires or at least one event
    becomes ready. It also commits that @num events from @start are processed
    by userspace and thus can be be removed or rearmed (depending on it's flags).
    It can be used for commit events read by userspace through mmap interface.
    Example userspace code (evtest.c) can be found on project's homepage.
 * added socket notifications (send/recv/accept)

Changes from 'take13' patchset:
 * do not get lock aroung user data check in __kevent_search()
 * fail early if there were no registered callbacks for given type of kevent
 * trailing whitespace cleanup

Changes from 'take12' patchset:
 * remove non-chardev interface for initialization
 * use pointer to kevent_mring instead of unsigned longs
 * use aligned 64bit type in raw user data (can be used by high-res timer if needed)
 * simplified enqueue/dequeue callbacks and kevent initialization
 * use nanoseconds for timeout
 * put number of milliseconds into timer's return data
 * move some definitions into user-visible header
 * removed filenames from comments

Changes from 'take11' patchset:
 * include missing headers into patchset
 * some trivial code cleanups (use goto instead of if/else games and so on)
 * some whitespace cleanups
 * check for ready_callback() callback before main loop which should save us some ticks

Changes from 'take10' patchset:
 * removed non-existent prototypes
 * added helper function for kevent_registered_callbacks
 * fixed 80 lines comments issues
 * added shared between userspace and kernelspace header instead of embedd them in one
 * core restructuring to remove forward declarations
 * s o m e w h i t e s p a c e c o d y n g s t y l e c l e a n u p
 * use vm_insert_page() instead of remap_pfn_range()

Changes from 'take9' patchset:
 * fixed ->nopage method

Changes from 'take8' patchset:
 * fixed mmap release bug
 * use module_init() instead of late_initcall()
 * use better structures for timer notifications

Changes from 'take7' patchset:
 * new mmap interface (not tested, waiting for other changes to be acked)
	- use nopage() method to dynamically substitue pages
	- allocate new page for events only when new added kevent requres it
	- do not use ugly index dereferencing, use structure instead
	- reduced amount of data in the ring (id and flags), 
		maximum 12 pages on x86 per kevent fd

Changes from 'take6' patchset:
 * a lot of comments!
 * do not use list poisoning for detection of the fact, that entry is in the list
 * return number of ready kevents even if copy*user() fails
 * strict check for number of kevents in syscall
 * use ARRAY_SIZE for array size calculation
 * changed superblock magic number
 * use SLAB_PANIC instead of direct panic() call
 * changed -E* return values
 * a lot of small cleanups and indent fixes

Changes from 'take5' patchset:
 * removed compilation warnings about unused wariables when lockdep is not turned on
 * do not use internal socket structures, use appropriate (exported) wrappers instead
 * removed default 1 second timeout
 * removed AIO stuff from patchset

Changes from 'take4' patchset:
 * use miscdevice instead of chardevice
 * comments fixes

Changes from 'take3' patchset:
 * removed serializing mutex from kevent_user_wait()
 * moved storage list processing to RCU
 * removed lockdep screaming - all storage locks are initialized in the same function, so it was
learned 
	to differentiate between various cases
 * remove kevent from storage if is marked as broken after callback
 * fixed a typo in mmaped buffer implementation which would end up in wrong index calcualtion 

Changes from 'take2' patchset:
 * split kevent_finish_user() to locked and unlocked variants
 * do not use KEVENT_STAT ifdefs, use inline functions instead
 * use array of callbacks of each type instead of each kevent callback initialization
 * changed name of ukevent guarding lock
 * use only one kevent lock in kevent_user for all hash buckets instead of per-bucket locks
 * do not use kevent_user_ctl structure instead provide needed arguments as syscall parameters
 * various indent cleanups
 * added optimisation, which is aimed to help when a lot of kevents are being copied from
userspace
 * mapped buffer (initial) implementation (no userspace yet)

Changes from 'take1' patchset:
 - rebased against 2.6.18-git tree
 - removed ioctl controlling
 - added new syscall kevent_get_events(int fd, unsigned int min_nr, unsigned int max_nr,
			unsigned int timeout, void __user *buf, unsigned flags)
 - use old syscall kevent_ctl for creation/removing, modification and initial kevent 
	initialization
 - use mutuxes instead of semaphores
 - added file descriptor check and return error if provided descriptor does not match
	kevent file operations
 - various indent fixes
 - removed aio_sendfile() declarations.

Thank you.

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>



^ permalink raw reply	[flat|nested] 76+ messages in thread

* [take30 2/9] kevent: Core files.
  2006-12-29 12:25   ` [take30 1/9] kevent: Description Evgeniy Polyakov
@ 2006-12-29 12:25     ` Evgeniy Polyakov
  2006-12-29 12:25       ` [take30 3/9] kevent: poll/select() notifications Evgeniy Polyakov
  0 siblings, 1 reply; 76+ messages in thread
From: Evgeniy Polyakov @ 2006-12-29 12:25 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: David Miller, Ulrich Drepper, Andrew Morton, Evgeniy Polyakov,
	netdev, Zach Brown, Christoph Hellwig, Chase Venters,
	Johann Borck, linux-kernel, Jeff Garzik, Jamal Hadi Salim,
	Ingo Molnar


Core files.

This patch includes core kevent files:
 * userspace controlling
 * kernelspace interfaces
 * initialization
 * notification state machines

Some bits of documentation can be found on project's homepage (and links from there):
http://tservice.net.ru/~s0mbre/old/?section=projects&item=kevent

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>

diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S
index 2697e92..076f11b 100644
--- a/arch/i386/kernel/syscall_table.S
+++ b/arch/i386/kernel/syscall_table.S
@@ -319,3 +319,8 @@ ENTRY(sys_call_table)
 	.long sys_move_pages
 	.long sys_getcpu
 	.long sys_epoll_pwait
+	.long sys_kevent_get_events
+	.long sys_kevent_ctl		/* 320 */
+	.long sys_kevent_wait
+	.long sys_kevent_commit
+	.long sys_kevent_init
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index b4aa875..dda2168 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -714,8 +714,13 @@ ia32_sys_call_table:
 	.quad compat_sys_get_robust_list
 	.quad sys_splice
 	.quad sys_sync_file_range
-	.quad sys_tee
+	.quad sys_tee			/* 315 */
 	.quad compat_sys_vmsplice
 	.quad compat_sys_move_pages
 	.quad sys_getcpu
+	.quad sys_kevent_get_events
+	.quad sys_kevent_ctl		/* 320 */
+	.quad sys_kevent_wait
+	.quad sys_kevent_commit
+	.quad sys_kevent_init
 ia32_syscall_end:		
diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
index 833fa17..0d2cef7 100644
--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h
@@ -325,10 +325,15 @@
 #define __NR_move_pages		317
 #define __NR_getcpu		318
 #define __NR_epoll_pwait	319
+#define __NR_kevent_get_events	320
+#define __NR_kevent_ctl		321
+#define __NR_kevent_wait	322
+#define __NR_kevent_commit	323
+#define __NR_kevent_init	324
 
 #ifdef __KERNEL__
 
-#define NR_syscalls 320
+#define NR_syscalls 325
 
 #define __ARCH_WANT_IPC_PARSE_VERSION
 #define __ARCH_WANT_OLD_READDIR
diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h
index c5f596e..fe052fe 100644
--- a/include/asm-x86_64/unistd.h
+++ b/include/asm-x86_64/unistd.h
@@ -619,8 +619,18 @@ __SYSCALL(__NR_sync_file_range, sys_sync_file_range)
 __SYSCALL(__NR_vmsplice, sys_vmsplice)
 #define __NR_move_pages		279
 __SYSCALL(__NR_move_pages, sys_move_pages)
-
-#define __NR_syscall_max __NR_move_pages
+#define __NR_kevent_get_events	280
+__SYSCALL(__NR_kevent_get_events, sys_kevent_get_events)
+#define __NR_kevent_ctl		281
+__SYSCALL(__NR_kevent_ctl, sys_kevent_ctl)
+#define __NR_kevent_wait	282
+__SYSCALL(__NR_kevent_wait, sys_kevent_wait)
+#define __NR_kevent_commit	283
+__SYSCALL(__NR_kevent_commit, sys_kevent_commit)
+#define __NR_kevent_init	284
+__SYSCALL(__NR_kevent_init, sys_kevent_init)
+
+#define __NR_syscall_max __NR_kevent_init
 
 #ifndef __NO_STUBS
 #define __ARCH_WANT_OLD_READDIR
diff --git a/include/linux/kevent.h b/include/linux/kevent.h
new file mode 100644
index 0000000..3be5d93
--- /dev/null
+++ b/include/linux/kevent.h
@@ -0,0 +1,247 @@
+/*
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef __KEVENT_H
+#define __KEVENT_H
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/rbtree.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/wait.h>
+#include <linux/net.h>
+#include <linux/rcupdate.h>
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/hrtimer.h>
+#include <linux/kevent_storage.h>
+#include <linux/ukevent.h>
+
+#define KEVENT_MIN_BUFFS_ALLOC	3
+
+struct kevent;
+struct kevent_storage;
+typedef int (* kevent_callback_t)(struct kevent *);
+
+/* @callback is called each time new event has been caught. */
+/* @enqueue is called each time new event is queued. */
+/* @dequeue is called each time event is dequeued. */
+/* @flags - flags for given set of callbacks. */
+
+struct kevent_callbacks {
+	kevent_callback_t	callback, enqueue, dequeue;
+	unsigned int		flags;
+};
+
+#define KEVENT_CALLBACKS_KERNELONLY	0x1
+
+int kevent_event_is_allowed(struct ukevent *e);
+
+#define KEVENT_READY		0x1
+#define KEVENT_STORAGE		0x2
+#define KEVENT_USER		0x4
+
+struct kevent
+{
+	/* Used for kevent freeing.*/
+	struct rcu_head		rcu_head;
+	struct ukevent		event;
+	/* This lock protects ukevent manipulations, e.g. ret_flags changes. */
+	spinlock_t		ulock;
+
+	/* Entry of user's tree. */
+	struct rb_node		kevent_node;
+	/* Entry of origin's queue. */
+	struct list_head	storage_entry;
+	/* Entry of user's ready. */
+	struct list_head	ready_entry;
+
+	u32			flags;
+
+	/* User who requested this kevent. */
+	struct kevent_user	*user;
+	/* Kevent container. */
+	struct kevent_storage	*st;
+
+	struct kevent_callbacks	callbacks;
+
+	/* Private data for different storages.
+	 * poll()/select storage has a list of wait_queue_t containers
+	 * for each ->poll() { poll_wait()' } here.
+	 */
+	void			*priv;
+};
+
+struct kevent_user
+{
+	struct rb_root		kevent_root;
+	spinlock_t		kevent_lock;
+	/* Number of queued kevents. */
+	unsigned int		kevent_num;
+
+	/* List of ready kevents. */
+	struct list_head	ready_list;
+	/* Number of ready kevents. */
+	unsigned int		ready_num;
+	/* Protects all manipulations with ready queue. */
+	spinlock_t 		ready_lock;
+
+	/* Protects against simultaneous kevent_user control manipulations. */
+	struct mutex		ctl_mutex;
+	/* Wait until some events are ready. */
+	wait_queue_head_t	wait;
+	/* Exit from syscall if someone wants us to do it */
+	int			need_exit;
+
+	/* Reference counter, increased for each new kevent. */
+	atomic_t		refcnt;
+
+	/* Mutex protecting userspace ring buffer. */
+	struct mutex		ring_lock;
+	/* Kernel index and size of the userspace ring buffer. */
+	unsigned int		kidx, uidx, ring_size, ring_over, full;
+	/* Pointer to userspace ring buffer. */
+	struct kevent_ring __user *pring;
+
+	/* Is used for absolute waiting times. */
+	struct hrtimer		timer;
+
+	/* Used for userspace private notifications. */
+	struct kevent_storage	st;
+
+#ifdef CONFIG_KEVENT_USER_STAT
+	unsigned long		im_num;
+	unsigned long		wait_num, ring_num;
+	unsigned long		total;
+#endif
+};
+
+int kevent_enqueue(struct kevent *k);
+int kevent_dequeue(struct kevent *k);
+int kevent_init(struct kevent *k);
+void kevent_requeue(struct kevent *k);
+int kevent_break(struct kevent *k);
+
+int kevent_add_callbacks(const struct kevent_callbacks *cb, int pos);
+
+void kevent_storage_ready(struct kevent_storage *st,
+		kevent_callback_t ready_callback, u32 event);
+int kevent_storage_init(void *origin, struct kevent_storage *st);
+void kevent_storage_fini(struct kevent_storage *st);
+int kevent_storage_enqueue(struct kevent_storage *st, struct kevent *k);
+void kevent_storage_dequeue(struct kevent_storage *st, struct kevent *k);
+
+void kevent_ready(struct kevent *k, int ret);
+
+int kevent_user_add_ukevent(struct ukevent *uk, struct kevent_user *u);
+
+#ifdef CONFIG_KEVENT_POLL
+void kevent_poll_reinit(struct file *file);
+#else
+static inline void kevent_poll_reinit(struct file *file)
+{
+}
+#endif
+
+#ifdef CONFIG_KEVENT_USER_STAT
+static inline void kevent_stat_init(struct kevent_user *u)
+{
+	u->wait_num = u->im_num = u->total = u->ring_num = 0;
+}
+static inline void kevent_stat_print(struct kevent_user *u)
+{
+	printk(KERN_INFO "%s: u: %p, wait: %lu, ring: %lu, immediately: %lu, total: %lu.\n",
+			__func__, u, u->wait_num, u->ring_num, u->im_num, u->total);
+}
+static inline void kevent_stat_im(struct kevent_user *u)
+{
+	u->im_num++;
+}
+static inline void kevent_stat_ring(struct kevent_user *u)
+{
+	u->ring_num++;
+}
+static inline void kevent_stat_wait(struct kevent_user *u)
+{
+	u->wait_num++;
+}
+static inline void kevent_stat_total(struct kevent_user *u)
+{
+	u->total++;
+}
+#else
+#define kevent_stat_print(u)		({ (void) u;})
+#define kevent_stat_init(u)		({ (void) u;})
+#define kevent_stat_im(u)		({ (void) u;})
+#define kevent_stat_wait(u)		({ (void) u;})
+#define kevent_stat_ring(u)		({ (void) u;})
+#define kevent_stat_total(u)		({ (void) u;})
+#endif
+
+#ifdef CONFIG_LOCKDEP
+void kevent_socket_reinit(struct socket *sock);
+void kevent_sk_reinit(struct sock *sk);
+#else
+static inline void kevent_socket_reinit(struct socket *sock)
+{
+}
+static inline void kevent_sk_reinit(struct sock *sk)
+{
+}
+#endif
+#ifdef CONFIG_KEVENT_SOCKET
+void kevent_socket_notify(struct sock *sock, u32 event);
+int kevent_socket_dequeue(struct kevent *k);
+int kevent_socket_enqueue(struct kevent *k);
+#define sock_async(__sk) sock_flag(__sk, SOCK_ASYNC)
+#else
+static inline void kevent_socket_notify(struct sock *sock, u32 event)
+{
+}
+#define sock_async(__sk)	({ (void)__sk; 0; })
+#endif
+
+#ifdef CONFIG_KEVENT_POLL
+static inline void kevent_init_file(struct file *file)
+{
+	kevent_storage_init(file, &file->st);
+}
+
+static inline void kevent_cleanup_file(struct file *file)
+{
+	kevent_storage_fini(&file->st);
+}
+#else
+static inline void kevent_init_file(struct file *file) {}
+static inline void kevent_cleanup_file(struct file *file) {}
+#endif
+
+#ifdef CONFIG_KEVENT_PIPE
+extern void kevent_pipe_notify(struct inode *inode, u32 events);
+#else
+static inline void kevent_pipe_notify(struct inode *inode, u32 events) {}
+#endif
+
+#ifdef CONFIG_KEVENT_SIGNAL
+extern int kevent_signal_notify(struct task_struct *tsk, int sig);
+#else
+static inline int kevent_signal_notify(struct task_struct *tsk, int sig) {return 0;}
+#endif
+
+#endif /* __KEVENT_H */
diff --git a/include/linux/kevent_storage.h b/include/linux/kevent_storage.h
new file mode 100644
index 0000000..a38575d
--- /dev/null
+++ b/include/linux/kevent_storage.h
@@ -0,0 +1,11 @@
+#ifndef __KEVENT_STORAGE_H
+#define __KEVENT_STORAGE_H
+
+struct kevent_storage
+{
+	void			*origin;		/* Originator's pointer, e.g. struct sock or struct file. Can be NULL. */
+	struct list_head	list;			/* List of queued kevents. */
+	spinlock_t		lock;			/* Protects users queue. */
+};
+
+#endif /* __KEVENT_STORAGE_H */
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 1912c6c..5f68945 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -54,6 +54,8 @@ struct compat_stat;
 struct compat_timeval;
 struct robust_list_head;
 struct getcpu_cache;
+struct ukevent;
+struct kevent_ring;
 
 #include <linux/types.h>
 #include <linux/aio_abi.h>
@@ -603,6 +605,14 @@ asmlinkage long sys_set_robust_list(struct robust_list_head __user *head,
 				    size_t len);
 asmlinkage long sys_getcpu(unsigned __user *cpu, unsigned __user *node, struct getcpu_cache __user *cache);
 
+asmlinkage long sys_kevent_get_events(int ctl_fd, unsigned int min, unsigned int max,
+		struct timespec timeout, struct ukevent __user *buf, unsigned flags);
+asmlinkage long sys_kevent_ctl(int ctl_fd, unsigned int cmd, unsigned int num, struct ukevent __user *buf);
+asmlinkage long sys_kevent_wait(int ctl_fd, unsigned int num, unsigned int old_uidx, 
+		struct timespec timeout, unsigned int flags);
+asmlinkage long sys_kevent_commit(int ctl_fd, unsigned int new_uidx, unsigned int over);
+asmlinkage long sys_kevent_init(int ctl_fd, struct kevent_ring __user *ring, unsigned int num, unsigned int flags);
+
 int kernel_execve(const char *filename, char *const argv[], char *const envp[]);
 
 #endif
diff --git a/include/linux/ukevent.h b/include/linux/ukevent.h
new file mode 100644
index 0000000..d975407
--- /dev/null
+++ b/include/linux/ukevent.h
@@ -0,0 +1,191 @@
+/*
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef __UKEVENT_H
+#define __UKEVENT_H
+
+#include <linux/types.h>
+
+/*
+ * Kevent request flags.
+ */
+
+/* Process this event only once and then remove it. */
+#define KEVENT_REQ_ONESHOT	0x1
+/* Kevent wakes up only first thread interested in given event,
+ * or all threads if this flag is set.
+ */
+#define KEVENT_REQ_WAKEUP_ALL	0x2
+/* Edge Triggered behaviour. */
+#define KEVENT_REQ_ET		0x4
+/* Perform the last check on kevent (call appropriate callback) when
+ * kevent is marked as ready and has been removed from ready queue.
+ * If it will be confirmed that kevent is ready 
+ * (k->callbacks.callback(k) returns true) then kevent will be copied
+ * to userspace, otherwise it will be requeued back to storage. 
+ * Second (checking) call is performed with this bit _cleared_ so
+ * callback can detect when it was called from 
+ * kevent_storage_ready() - bit is set, or 
+ * kevent_dequeue_ready() - bit is cleared. 
+ * If kevent will be requeued, bit will be set again. */
+#define KEVENT_REQ_LAST_CHECK	0x8
+/*
+ * Always queue kevent even if it is immediately ready.
+ */
+#define KEVENT_REQ_ALWAYS_QUEUE	0x10
+/*
+ * Mark event as ready immediately on enqueueing time.
+ */
+#define KEVENT_REQ_READY	0x20
+
+/*
+ * Kevent return flags.
+ */
+/* Kevent is broken. */
+#define KEVENT_RET_BROKEN	0x1
+/* Kevent processing was finished successfully. */
+#define KEVENT_RET_DONE		0x2
+/* Kevent was not copied into ring buffer due to some error conditions. */
+#define KEVENT_RET_COPY_FAILED	0x4
+
+/*
+ * Kevent type set.
+ */
+#define KEVENT_SOCKET 		0
+#define KEVENT_INODE		1
+#define KEVENT_TIMER		2
+#define KEVENT_POLL		3
+#define KEVENT_NAIO		4
+#define KEVENT_AIO		5
+#define KEVENT_PIPE		6
+#define KEVENT_SIGNAL		7
+#define KEVENT_POSIX_TIMER	8
+#define KEVENT_UNOTIFY		9
+
+/* Used as array size, so must be equal to the last KEVENT type + 1 */
+#define	KEVENT_MAX		10
+
+/*
+ * Per-type event sets.
+ * Number of per-event sets should be exactly as number of kevent types.
+ */
+
+/*
+ * Timer events.
+ */
+#define	KEVENT_TIMER_FIRED	0x1
+
+/*
+ * Socket/network asynchronous IO and PIPE events.
+ */
+#define	KEVENT_SOCKET_RECV	0x1
+#define	KEVENT_SOCKET_ACCEPT	0x2
+#define	KEVENT_SOCKET_SEND	0x4
+
+/*
+ * Inode events.
+ */
+#define	KEVENT_INODE_CREATE	0x1
+#define	KEVENT_INODE_REMOVE	0x2
+
+/*
+ * Poll events.
+ */
+#define	KEVENT_POLL_POLLIN	0x0001
+#define	KEVENT_POLL_POLLPRI	0x0002
+#define	KEVENT_POLL_POLLOUT	0x0004
+#define	KEVENT_POLL_POLLERR	0x0008
+#define	KEVENT_POLL_POLLHUP	0x0010
+#define	KEVENT_POLL_POLLNVAL	0x0020
+
+#define	KEVENT_POLL_POLLRDNORM	0x0040
+#define	KEVENT_POLL_POLLRDBAND	0x0080
+#define	KEVENT_POLL_POLLWRNORM	0x0100
+#define	KEVENT_POLL_POLLWRBAND	0x0200
+#define	KEVENT_POLL_POLLMSG	0x0400
+#define	KEVENT_POLL_POLLREMOVE	0x1000
+#define KEVENT_POLL_POLLRDHUP	0x2000
+
+/*
+ * Asynchronous IO events.
+ */
+#define	KEVENT_AIO_BIO		0x1
+
+/*
+ * Signal events.
+ */
+#define KEVENT_SIGNAL_DELIVERY		0x1
+
+/* If set in raw64, then given signals will not be delivered
+ * in a usual way through sigmask update and signal callback 
+ * invocation. */
+#define KEVENT_SIGNAL_NOMASK	0x8000000000000000ULL
+
+/* Mask of all possible event values. */
+#define KEVENT_MASK_ALL		0xffffffff
+/* Empty mask of ready events. */
+#define KEVENT_MASK_EMPTY	0x0
+
+struct kevent_id
+{
+	union {
+		__u32		raw[2];
+		__u64		raw_u64 __attribute__((aligned(8)));
+	};
+};
+
+struct ukevent
+{
+	/* Id of this request, e.g. socket number, file descriptor and so on... */
+	struct kevent_id	id;
+	/* Event type, e.g. KEVENT_SOCK, KEVENT_INODE, KEVENT_TIMER and so on... */
+	__u32			type;
+	/* Event itself, e.g. SOCK_ACCEPT, INODE_CREATED, TIMER_FIRED... */
+	__u32			event;
+	/* Per-event request flags */
+	__u32			req_flags;
+	/* Per-event return flags */
+	__u32			ret_flags;
+	/* Event return data. Event originator fills it with anything it likes. */
+	__u32			ret_data[2];
+	/* User's data. It is not used, just copied to/from user.
+	 * The whole structure is aligned to 8 bytes already, so the last union
+	 * is aligned properly.
+	 */
+	union {
+		__u32		user[2];
+		void		*ptr;
+	};
+};
+
+struct kevent_ring
+{
+	unsigned int		ring_kidx, ring_over;
+	struct ukevent		event[0];
+};
+
+#define	KEVENT_CTL_ADD 		0
+#define	KEVENT_CTL_REMOVE	1
+#define	KEVENT_CTL_MODIFY	2
+#define	KEVENT_CTL_READY	3
+
+/* Provided timespec parameter uses absolute time, i.e. 'wait until Aug 27, 2194' */
+#define KEVENT_FLAGS_ABSTIME	1
+
+#endif /* __UKEVENT_H */
diff --git a/init/Kconfig b/init/Kconfig
index a3f83e2..561a0fa 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -219,6 +219,8 @@ config AUDITSYSCALL
 	  such as SELinux.  To use audit's filesystem watch feature, please
 	  ensure that INOTIFY is configured.
 
+source "kernel/kevent/Kconfig"
+
 config IKCONFIG
 	tristate "Kernel .config support"
 	---help---
diff --git a/kernel/Makefile b/kernel/Makefile
index 14f4d45..6bccdbb 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -46,6 +46,7 @@ obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
 obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
+obj-$(CONFIG_KEVENT) += kevent/
 obj-$(CONFIG_RELAY) += relay.o
 obj-$(CONFIG_UTS_NS) += utsname.o
 obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
diff --git a/kernel/kevent/Kconfig b/kernel/kevent/Kconfig
new file mode 100644
index 0000000..2e9874b
--- /dev/null
+++ b/kernel/kevent/Kconfig
@@ -0,0 +1,74 @@
+config KEVENT
+	bool "Kernel event notification mechanism" if EMBEDDED
+	default y
+	help
+	  This option enables event queue mechanism.
+	  It can be used as replacement for poll()/select(), AIO callback
+	  invocations, advanced timer notifications and other kernel
+	  object status changes.
+
+config KEVENT_USER_STAT
+	bool "Kevent user statistic"
+	depends on KEVENT
+	help
+	  This option will turn kevent_user statistic collection on.
+	  Statistic data includes total number of kevent, number of kevents
+	  which are ready immediately at insertion time and number of kevents
+	  which were removed through readiness completion.
+	  It will be printed each time control kevent descriptor is closed.
+
+config KEVENT_TIMER
+	bool "Kernel event notifications for timers"
+	depends on KEVENT
+	default y
+	help
+	  This option allows to use timers through KEVENT subsystem.
+
+config KEVENT_POLL
+	bool "Kernel event notifications for poll()/select()"
+	depends on KEVENT
+	default y
+	help
+	  This option allows to use kevent subsystem for poll()/select()
+	  notifications.
+
+config KEVENT_SOCKET
+	bool "Kernel event notifications for sockets"
+	depends on NET && KEVENT
+	default y
+	help
+	  This option enables notifications through KEVENT subsystem of 
+	  sockets operations, like new packet receiving conditions, 
+	  ready for accept conditions and so on.
+
+config KEVENT_PIPE
+	bool "Kernel event notifications for pipes"
+	depends on KEVENT
+	default y
+	help
+	  This option enables notifications through KEVENT subsystem of 
+	  pipe read/write operations.
+
+config KEVENT_SIGNAL
+	bool "Kernel event notifications for signals"
+	depends on KEVENT
+	default y
+	help
+	  This option enables signal delivery through KEVENT subsystem.
+	  Signals which were requested to be delivered through kevent
+	  subsystem must be registered through usual signal() and others
+	  syscalls, this option allows alternative delivery.
+	  With KEVENT_SIGNAL_NOMASK flag being set in kevent for set of 
+	  signals, they will not be delivered in a usual way.
+	  Kevents for appropriate signals are not copied when process forks,
+	  new process must add new kevents after fork(). Mask of signals
+	  is copied as before.
+
+config KEVENT_UNOTIFY
+	bool "Private userspace notifications over kevent"
+	depends on KEVENT
+	default y
+	help
+	  This option enable possibility to insert private userspace events,
+	  which can be marked as ready on demand using kevent_ctl(KEVENT_CTL_READY)
+	  command.
diff --git a/kernel/kevent/Makefile b/kernel/kevent/Makefile
new file mode 100644
index 0000000..6fee3da
--- /dev/null
+++ b/kernel/kevent/Makefile
@@ -0,0 +1,7 @@
+obj-y := kevent.o kevent_user.o
+obj-$(CONFIG_KEVENT_TIMER) += kevent_timer.o
+obj-$(CONFIG_KEVENT_POLL) += kevent_poll.o
+obj-$(CONFIG_KEVENT_SOCKET) += kevent_socket.o
+obj-$(CONFIG_KEVENT_PIPE) += kevent_pipe.o
+obj-$(CONFIG_KEVENT_SIGNAL) += kevent_signal.o
+obj-$(CONFIG_KEVENT_UNOTIFY) += kevent_unotify.o
diff --git a/kernel/kevent/kevent.c b/kernel/kevent/kevent.c
new file mode 100644
index 0000000..743cd0c
--- /dev/null
+++ b/kernel/kevent/kevent.c
@@ -0,0 +1,267 @@
+/*
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/mempool.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/kevent.h>
+
+/*
+ * Attempts to add an event into appropriate origin's queue.
+ * Returns positive value if this event is ready immediately,
+ * negative value in case of error and zero if event has been queued.
+ * ->enqueue() callback must increase origin's reference counter.
+ */
+int kevent_enqueue(struct kevent *k)
+{
+	return k->callbacks.enqueue(k);
+}
+
+/*
+ * Remove event from the appropriate queue.
+ * ->dequeue() callback must decrease origin's reference counter.
+ */
+int kevent_dequeue(struct kevent *k)
+{
+	return k->callbacks.dequeue(k);
+}
+
+/*
+ * Mark kevent as broken.
+ */
+int kevent_break(struct kevent *k)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&k->ulock, flags);
+	k->event.ret_flags |= KEVENT_RET_BROKEN;
+	spin_unlock_irqrestore(&k->ulock, flags);
+	return -EINVAL;
+}
+
+static struct kevent_callbacks kevent_registered_callbacks[KEVENT_MAX] __read_mostly;
+
+int kevent_event_is_allowed(struct ukevent *e)
+{
+	if (unlikely(e->type >= KEVENT_MAX))
+		return 0;
+
+	if (!kevent_registered_callbacks[e->type].callback)
+		return 0;
+
+	if (unlikely(kevent_registered_callbacks[e->type].callback == kevent_break))
+		return 0;
+
+	if (kevent_registered_callbacks[e->type].flags & KEVENT_CALLBACKS_KERNELONLY)
+		return 0;
+	
+	return 1;
+}
+
+int kevent_add_callbacks(const struct kevent_callbacks *cb, int pos)
+{
+	struct kevent_callbacks *p;
+
+	if (pos >= KEVENT_MAX)
+		return -EINVAL;
+
+	p = &kevent_registered_callbacks[pos];
+
+	p->enqueue = (cb->enqueue) ? cb->enqueue : kevent_break;
+	p->dequeue = (cb->dequeue) ? cb->dequeue : kevent_break;
+	p->callback = (cb->callback) ? cb->callback : kevent_break;
+	p->flags = cb->flags;
+
+	printk(KERN_INFO "KEVENT: Added callbacks for type %d.\n", pos);
+	return 0;
+}
+
+/*
+ * Must be called before event is going to be added into some origin's queue.
+ * Initializes ->enqueue(), ->dequeue() and ->callback() callbacks.
+ * If failed, kevent should not be used or kevent_enqueue() will fail to add
+ * this kevent into origin's queue with setting
+ * KEVENT_RET_BROKEN flag in kevent->event.ret_flags.
+ */
+int kevent_init(struct kevent *k)
+{
+	spin_lock_init(&k->ulock);
+	k->flags = 0;
+
+	if (unlikely(k->event.type >= KEVENT_MAX)) {
+		kevent_break(k);
+		return -ENOSYS;
+	}
+
+	if (!kevent_registered_callbacks[k->event.type].callback) {
+		kevent_break(k);
+		return -ENOSYS;
+	}
+
+	k->callbacks = kevent_registered_callbacks[k->event.type];
+	if (unlikely(k->callbacks.callback == kevent_break)) {
+		kevent_break(k);
+		return -ENOSYS;
+	}
+
+	return 0;
+}
+
+/*
+ * Called from ->enqueue() callback when reference counter for given
+ * origin (socket, inode...) has been increased.
+ */
+int kevent_storage_enqueue(struct kevent_storage *st, struct kevent *k)
+{
+	unsigned long flags;
+
+	k->st = st;
+	spin_lock_irqsave(&st->lock, flags);
+	list_add_tail_rcu(&k->storage_entry, &st->list);
+	k->flags |= KEVENT_STORAGE;
+	if (k->event.req_flags & KEVENT_REQ_READY)
+		kevent_ready(k, 1);
+	spin_unlock_irqrestore(&st->lock, flags);
+	return 0;
+}
+
+/*
+ * Dequeue kevent from origin's queue.
+ * It does not decrease origin's reference counter in any way
+ * and must be called before it, so storage itself must be valid.
+ * It is called from ->dequeue() callback.
+ */
+void kevent_storage_dequeue(struct kevent_storage *st, struct kevent *k)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&st->lock, flags);
+	if (k->flags & KEVENT_STORAGE) {
+		list_del_rcu(&k->storage_entry);
+		k->flags &= ~KEVENT_STORAGE;
+	}
+	spin_unlock_irqrestore(&st->lock, flags);
+}
+
+void kevent_ready(struct kevent *k, int ret)
+{
+	unsigned long flags;
+	int rem;
+
+	spin_lock_irqsave(&k->ulock, flags);
+	if (ret > 0)
+		k->event.ret_flags |= KEVENT_RET_DONE;
+	else if (ret < 0)
+		k->event.ret_flags |= (KEVENT_RET_BROKEN | KEVENT_RET_DONE);
+	else
+		ret = (k->event.ret_flags & (KEVENT_RET_BROKEN|KEVENT_RET_DONE));
+	rem = (k->event.req_flags & KEVENT_REQ_ONESHOT);
+	spin_unlock_irqrestore(&k->ulock, flags);
+
+	if (ret) {
+		if ((rem || ret < 0) && (k->flags & KEVENT_STORAGE)) {
+			list_del_rcu(&k->storage_entry);
+			k->flags &= ~KEVENT_STORAGE;
+		}
+
+		spin_lock_irqsave(&k->user->ready_lock, flags);
+		if (!(k->flags & KEVENT_READY)) {
+			list_add_tail(&k->ready_entry, &k->user->ready_list);
+			k->flags |= KEVENT_READY;
+			k->user->ready_num++;
+		}
+		spin_unlock_irqrestore(&k->user->ready_lock, flags);
+		wake_up(&k->user->wait);
+	}
+}
+
+/*
+ * Call kevent ready callback and queue it into ready queue if needed.
+ * If kevent is marked as one-shot, then remove it from storage queue.
+ */
+static int __kevent_requeue(struct kevent *k, u32 event)
+{
+	int ret;
+
+	ret = k->callbacks.callback(k);
+
+	kevent_ready(k, ret);
+
+	return ret;
+}
+
+/*
+ * Check if kevent is ready (by invoking it's callback) and requeue/remove
+ * if needed.
+ */
+void kevent_requeue(struct kevent *k)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&k->st->lock, flags);
+	__kevent_requeue(k, 0);
+	spin_unlock_irqrestore(&k->st->lock, flags);
+}
+
+/*
+ * Called each time some activity in origin (socket, inode...) is noticed.
+ */
+void kevent_storage_ready(struct kevent_storage *st,
+		kevent_callback_t ready_callback, u32 event)
+{
+	struct kevent *k;
+	int wake_num = 0;
+
+	rcu_read_lock();
+	if (unlikely(ready_callback))
+		list_for_each_entry_rcu(k, &st->list, storage_entry)
+			(*ready_callback)(k);
+
+	list_for_each_entry_rcu(k, &st->list, storage_entry) {
+		if (event & k->event.event)
+			if ((k->event.req_flags & KEVENT_REQ_WAKEUP_ALL) || wake_num == 0)
+				if (__kevent_requeue(k, event))
+					wake_num++;
+	}
+	rcu_read_unlock();
+}
+
+int kevent_storage_init(void *origin, struct kevent_storage *st)
+{
+	spin_lock_init(&st->lock);
+	st->origin = origin;
+	INIT_LIST_HEAD(&st->list);
+	return 0;
+}
+
+/*
+ * Mark all events as broken, that will remove them from storage,
+ * so storage origin (inode, socket and so on) can be safely removed.
+ * No new entries are allowed to be added into the storage at this point.
+ * (Socket is removed from file table at this point for example).
+ */
+void kevent_storage_fini(struct kevent_storage *st)
+{
+	kevent_storage_ready(st, kevent_break, KEVENT_MASK_ALL);
+}
diff --git a/kernel/kevent/kevent_user.c b/kernel/kevent/kevent_user.c
new file mode 100644
index 0000000..88bee6c
--- /dev/null
+++ b/kernel/kevent/kevent_user.c
@@ -0,0 +1,1371 @@
+/*
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/device.h>
+#include <linux/poll.h>
+#include <linux/kevent.h>
+#include <linux/miscdevice.h>
+#include <asm/io.h>
+
+static struct kmem_cache *kevent_cache __read_mostly;
+static struct kmem_cache *kevent_user_cache __read_mostly;
+
+static int kevent_debug_abstime;
+
+/*
+ * kevents are pollable, return POLLIN and POLLRDNORM
+ * when there is at least one ready kevent.
+ */
+static unsigned int kevent_user_poll(struct file *file, struct poll_table_struct *wait)
+{
+	struct kevent_user *u = file->private_data;
+	unsigned int mask;
+
+	poll_wait(file, &u->wait, wait);
+	mask = 0;
+
+	if (u->ready_num || u->need_exit)
+		mask |= POLLIN | POLLRDNORM;
+	u->need_exit = 0;
+
+	return mask;
+}
+
+static inline unsigned int kevent_ring_space(struct kevent_user *u)
+{
+	if (!u->pring)
+		return 1;
+
+	if (u->full)
+		return 0;
+
+	return (u->uidx > u->kidx)?
+		(u->uidx - u->kidx):
+		(u->ring_size - (u->kidx - u->uidx));
+}
+
+static inline int kevent_ring_index_inc(unsigned int *pidx, unsigned int size)
+{
+	unsigned int idx = *pidx;
+
+	if (++idx >= size)
+		idx = 0;
+	*pidx = idx;
+	return (idx == 0);
+}
+
+/*
+ * Copies kevent into userspace ring buffer if it was initialized.
+ * Returns 
+ *  0 on success or if ring buffer is not used
+ *  -EAGAIN if there were no place for that kevent
+ *  -EFAULT if copy_to_user() failed.
+ *
+ *  Must be called under kevent_user->ring_lock locked.
+ */
+static int kevent_copy_ring_buffer(struct kevent *k)
+{
+	struct kevent_ring __user *ring;
+	struct kevent_user *u = k->user;
+	unsigned long flags;
+	int err;
+
+	ring = u->pring;
+	if (!ring)
+		return 0;
+
+	if (!kevent_ring_space(u))
+		return -EAGAIN;
+
+	if (copy_to_user(&ring->event[u->kidx], &k->event, sizeof(struct ukevent))) {
+		err = -EFAULT;
+		goto err_out_exit;
+	}
+
+	kevent_ring_index_inc(&u->kidx, u->ring_size);
+
+	if (u->kidx == u->uidx)
+		u->full = 1;
+
+	if (put_user(u->kidx, &ring->ring_kidx)) {
+		err = -EFAULT;
+		goto err_out_exit;
+	}
+
+	return 0;
+
+err_out_exit:
+	spin_lock_irqsave(&k->ulock, flags);
+	k->event.ret_flags |= KEVENT_RET_COPY_FAILED;
+	spin_unlock_irqrestore(&k->ulock, flags);
+	return err;
+}
+
+static struct kevent_user *kevent_user_alloc(struct kevent_ring __user *ring, unsigned int num)
+{
+	struct kevent_user *u;
+
+	u = kmem_cache_alloc(kevent_user_cache, GFP_KERNEL);
+	if (!u)
+		return NULL;
+
+	INIT_LIST_HEAD(&u->ready_list);
+	spin_lock_init(&u->ready_lock);
+	kevent_stat_init(u);
+	spin_lock_init(&u->kevent_lock);
+	u->kevent_root = RB_ROOT;
+
+	mutex_init(&u->ctl_mutex);
+	init_waitqueue_head(&u->wait);
+	u->need_exit = 0;
+
+	atomic_set(&u->refcnt, 1);
+
+	mutex_init(&u->ring_lock);
+	u->kidx = u->uidx = u->ring_over = u->full = 0;
+
+	u->pring = ring;
+	u->ring_size = num;
+
+	hrtimer_init(&u->timer, CLOCK_REALTIME, HRTIMER_ABS);
+
+	return u;
+}
+
+/*
+ * Kevent userspace control block reference counting.
+ * Set to 1 at creation time, when appropriate kevent file descriptor
+ * is closed, that reference counter is decreased.
+ * When counter hits zero block is freed.
+ */
+static inline void kevent_user_get(struct kevent_user *u)
+{
+	atomic_inc(&u->refcnt);
+}
+
+static inline void kevent_user_put(struct kevent_user *u)
+{
+	if (atomic_dec_and_test(&u->refcnt)) {
+		kevent_stat_print(u);
+		hrtimer_cancel(&u->timer);
+		kmem_cache_free(kevent_user_cache, u);
+	}
+}
+
+static inline int kevent_compare_id(struct kevent_id *left, struct kevent_id *right, 
+		__u32 type_left, __u32 type_right)
+{
+	if (left->raw_u64 > right->raw_u64)
+		return -1;
+
+	if (right->raw_u64 > left->raw_u64)
+		return 1;
+
+	if (type_left > type_right)
+		return -1;
+
+	if (type_left < type_right)
+		return 1;
+
+	return 0;
+}
+
+/*
+ * RCU protects storage list (kevent->storage_entry).
+ * Free entry in RCU callback, it is dequeued from all lists at
+ * this point.
+ */
+
+static void kevent_free_rcu(struct rcu_head *rcu)
+{
+	struct kevent *kevent = container_of(rcu, struct kevent, rcu_head);
+	kmem_cache_free(kevent_cache, kevent);
+}
+
+/*
+ * Must be called under u->ready_lock.
+ * This function unlinks kevent from ready queue.
+ */
+static inline void kevent_unlink_ready(struct kevent *k)
+{
+	list_del(&k->ready_entry);
+	k->flags &= ~KEVENT_READY;
+	k->user->ready_num--;
+}
+
+static void kevent_remove_ready(struct kevent *k)
+{
+	struct kevent_user *u = k->user;
+	unsigned long flags;
+
+	spin_lock_irqsave(&u->ready_lock, flags);
+	if (k->flags & KEVENT_READY)
+		kevent_unlink_ready(k);
+	spin_unlock_irqrestore(&u->ready_lock, flags);
+}
+
+/*
+ * Complete kevent removing - it dequeues kevent from storage list
+ * if it is requested, removes kevent from ready list, drops userspace
+ * control block reference counter and schedules kevent freeing through RCU.
+ */
+static void kevent_finish_user_complete(struct kevent *k, int deq)
+{
+	if (deq)
+		kevent_dequeue(k);
+
+	kevent_remove_ready(k);
+
+	kevent_user_put(k->user);
+	call_rcu(&k->rcu_head, kevent_free_rcu);
+}
+
+/*
+ * Remove from all lists and free kevent.
+ * Must be called under kevent_user->kevent_lock to protect
+ * kevent->kevent_entry removing.
+ */
+static void __kevent_finish_user(struct kevent *k, int deq)
+{
+	struct kevent_user *u = k->user;
+
+	rb_erase(&k->kevent_node, &u->kevent_root);
+	k->flags &= ~KEVENT_USER;
+	u->kevent_num--;
+	kevent_finish_user_complete(k, deq);
+}
+
+/*
+ * Remove kevent from user's list of all events,
+ * dequeue it from storage and decrease user's reference counter,
+ * since this kevent does not exist anymore. That is why it is freed here.
+ */
+static void kevent_finish_user(struct kevent *k, int deq)
+{
+	struct kevent_user *u = k->user;
+	unsigned long flags;
+
+	spin_lock_irqsave(&u->kevent_lock, flags);
+	rb_erase(&k->kevent_node, &u->kevent_root);
+	k->flags &= ~KEVENT_USER;
+	u->kevent_num--;
+	spin_unlock_irqrestore(&u->kevent_lock, flags);
+	kevent_finish_user_complete(k, deq);
+}
+
+static struct kevent *__kevent_dequeue_ready_one(struct kevent_user *u)
+{
+	unsigned long flags;
+	struct kevent *k = NULL;
+
+	if (u->ready_num) {
+		spin_lock_irqsave(&u->ready_lock, flags);
+		if (u->ready_num && !list_empty(&u->ready_list)) {
+			k = list_entry(u->ready_list.next, struct kevent, ready_entry);
+			kevent_unlink_ready(k);
+		}
+		spin_unlock_irqrestore(&u->ready_lock, flags);
+	}
+
+	return k;
+}
+
+static struct kevent *kevent_dequeue_ready_one(struct kevent_user *u)
+{
+	struct kevent *k = NULL;
+
+	while (u->ready_num && !k) {
+		k = __kevent_dequeue_ready_one(u);
+
+		if (k && (k->event.req_flags & KEVENT_REQ_LAST_CHECK)) {
+			unsigned long flags;
+
+			spin_lock_irqsave(&k->ulock, flags);
+			k->event.req_flags &= ~KEVENT_REQ_LAST_CHECK;
+			spin_unlock_irqrestore(&k->ulock, flags);
+
+			if (!k->callbacks.callback(k)) {
+				spin_lock_irqsave(&k->ulock, flags);
+				k->event.req_flags |= KEVENT_REQ_LAST_CHECK;
+				k->event.ret_flags = 0;
+				k->event.ret_data[0] = k->event.ret_data[1] = 0;
+				spin_unlock_irqrestore(&k->ulock, flags);
+				k = NULL;
+			}
+		} else
+			break;
+	}
+
+	return k;
+}
+
+static inline void kevent_copy_ring(struct kevent *k)
+{
+	unsigned long flags;
+
+	if (!k)
+		return;
+
+	if (kevent_copy_ring_buffer(k)) {
+		spin_lock_irqsave(&k->ulock, flags);
+		k->event.ret_flags |= KEVENT_RET_COPY_FAILED;
+		spin_unlock_irqrestore(&k->ulock, flags);
+	}
+}
+
+/*
+ * Dequeue one entry from user's ready queue.
+ */
+static struct kevent *kevent_dequeue_ready(struct kevent_user *u)
+{
+	struct kevent *k;
+
+	mutex_lock(&u->ring_lock);
+	k = kevent_dequeue_ready_one(u);
+	kevent_copy_ring(k);
+	mutex_unlock(&u->ring_lock);
+
+	return k;
+}
+
+/*
+ * Dequeue one entry from user's ready queue if there is space in ring buffer.
+ */
+static struct kevent *kevent_dequeue_ready_ring(struct kevent_user *u)
+{
+	struct kevent *k = NULL;
+
+	mutex_lock(&u->ring_lock);
+	if (kevent_ring_space(u)) {
+		k = kevent_dequeue_ready_one(u);
+		kevent_copy_ring(k);
+	}
+	mutex_unlock(&u->ring_lock);
+
+	return k;
+}
+
+static void kevent_complete_ready(struct kevent *k)
+{
+	if (k->event.req_flags & KEVENT_REQ_ONESHOT)
+		/*
+		 * If it is one-shot kevent, it has been removed already from
+		 * origin's queue, so we can easily free it here.
+		 */
+		kevent_finish_user(k, 1);
+	else if (k->event.req_flags & KEVENT_REQ_ET) {
+		unsigned long flags;
+
+		/*
+		 * Edge-triggered behaviour: mark event as clear new one.
+		 */
+
+		spin_lock_irqsave(&k->ulock, flags);
+		k->event.ret_flags = 0;
+		k->event.ret_data[0] = k->event.ret_data[1] = 0;
+		spin_unlock_irqrestore(&k->ulock, flags);
+	}
+}
+
+/*
+ * Search a kevent inside kevent tree for given ukevent.
+ */
+static struct kevent *__kevent_search(struct kevent_id *id, __u32 type, struct kevent_user *u)
+{
+	struct kevent *k, *ret = NULL;
+	struct rb_node *n = u->kevent_root.rb_node;
+	int cmp;
+
+	while (n) {
+		k = rb_entry(n, struct kevent, kevent_node);
+		cmp = kevent_compare_id(&k->event.id, id, k->event.type, type);
+
+		if (cmp > 0)
+			n = n->rb_right;
+		else if (cmp < 0)
+			n = n->rb_left;
+		else {
+			ret = k;
+			break;
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * Search and modify kevent according to provided ukevent.
+ */
+static int kevent_modify(struct ukevent *uk, struct kevent_user *u)
+{
+	struct kevent *k;
+	int err = -ENODEV;
+	unsigned long flags;
+
+	spin_lock_irqsave(&u->kevent_lock, flags);
+	k = __kevent_search(&uk->id, uk->type, u);
+	if (k) {
+		spin_lock(&k->ulock);
+		k->event.event = uk->event;
+		k->event.req_flags = uk->req_flags;
+		k->event.ret_flags = 0;
+		spin_unlock(&k->ulock);
+		kevent_requeue(k);
+		err = 0;
+	}
+	spin_unlock_irqrestore(&u->kevent_lock, flags);
+
+	return err;
+}
+
+/*
+ * Remove kevent which matches provided ukevent.
+ */
+static int kevent_remove(struct ukevent *uk, struct kevent_user *u)
+{
+	int err = -ENODEV;
+	struct kevent *k;
+	unsigned long flags;
+
+	spin_lock_irqsave(&u->kevent_lock, flags);
+	k = __kevent_search(&uk->id, uk->type, u);
+	if (k) {
+		__kevent_finish_user(k, 1);
+		err = 0;
+	}
+	spin_unlock_irqrestore(&u->kevent_lock, flags);
+
+	return err;
+}
+
+/*
+ * Detaches userspace control block from file descriptor
+ * and decrease it's reference counter.
+ * No new kevents can be added or removed from any list at this point.
+ */
+static int kevent_user_release(struct inode *inode, struct file *file)
+{
+	struct kevent_user *u = file->private_data;
+	struct kevent *k;
+	struct rb_node *n;
+
+	for (n = rb_first(&u->kevent_root); n; n = rb_next(n)) {
+		k = rb_entry(n, struct kevent, kevent_node);
+		kevent_finish_user(k, 1);
+	}
+
+	kevent_user_put(u);
+	file->private_data = NULL;
+
+	return 0;
+}
+
+/*
+ * Read requested number of ukevents in one shot.
+ */
+static struct ukevent *kevent_get_user(unsigned int num, void __user *arg)
+{
+	struct ukevent *ukev;
+
+	ukev = kmalloc(sizeof(struct ukevent) * num, GFP_KERNEL);
+	if (!ukev)
+		return NULL;
+
+	if (copy_from_user(ukev, arg, sizeof(struct ukevent) * num)) {
+		kfree(ukev);
+		return NULL;
+	}
+
+	return ukev;
+}
+
+static int kevent_mark_ready(struct ukevent *uk, struct kevent_user *u)
+{
+	struct kevent *k;
+	int err = -ENODEV;
+	unsigned long flags;
+
+	spin_lock_irqsave(&u->kevent_lock, flags);
+	k = __kevent_search(&uk->id, uk->type, u);
+	if (k) {
+		spin_lock(&k->st->lock);
+		kevent_ready(k, 1);
+		spin_unlock(&k->st->lock);
+		err = 0;
+	}
+	spin_unlock_irqrestore(&u->kevent_lock, flags);
+
+	return err;
+}
+
+/*
+ * Mark appropriate kevents as ready.
+ * If number of events is zero just wake up one listener.
+ */
+static int kevent_user_ctl_ready(struct kevent_user *u, unsigned int num, void __user *arg)
+{
+	int err = -EINVAL, cerr = 0, rnum = 0, i;
+	void __user *orig = arg;
+	struct ukevent uk;
+
+	if (num > u->kevent_num)
+		return err;
+	
+	if (!num) {
+		u->need_exit = 1;
+		wake_up(&u->wait);
+		return 0;
+	}
+
+	mutex_lock(&u->ctl_mutex);
+
+	if (num > KEVENT_MIN_BUFFS_ALLOC) {
+		struct ukevent *ukev;
+
+		ukev = kevent_get_user(num, arg);
+		if (ukev) {
+			for (i = 0; i < num; ++i) {
+				err = kevent_mark_ready(&ukev[i], u);
+				if (err) {
+					if (i != rnum)
+						memcpy(&ukev[rnum], &ukev[i], sizeof(struct ukevent));
+					rnum++;
+				}
+			}
+			if (copy_to_user(orig, ukev, rnum*sizeof(struct ukevent)))
+				cerr = -EFAULT;
+			kfree(ukev);
+			goto out_setup;
+		}
+	}
+
+	for (i = 0; i < num; ++i) {
+		if (copy_from_user(&uk, arg, sizeof(struct ukevent))) {
+			cerr = -EFAULT;
+			break;
+		}
+		arg += sizeof(struct ukevent);
+
+		err = kevent_mark_ready(&uk, u);
+		if (err) {
+			if (copy_to_user(orig, &uk, sizeof(struct ukevent))) {
+				cerr = -EFAULT;
+				break;
+			}
+			orig += sizeof(struct ukevent);
+			rnum++;
+		}
+	}
+
+out_setup:
+	if (cerr < 0) {
+		err = cerr;
+		goto out_remove;
+	}
+
+	err = num - rnum;
+out_remove:
+	mutex_unlock(&u->ctl_mutex);
+
+	return err;
+}
+
+/*
+ * Read from userspace all ukevents and modify appropriate kevents.
+ * If provided number of ukevents is more that threshold, it is faster
+ * to allocate a room for them and copy in one shot instead of copy
+ * one-by-one and then process them.
+ */
+static int kevent_user_ctl_modify(struct kevent_user *u, unsigned int num, void __user *arg)
+{
+	int err = 0, i;
+	struct ukevent uk;
+
+	mutex_lock(&u->ctl_mutex);
+
+	if (num > u->kevent_num) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	if (num > KEVENT_MIN_BUFFS_ALLOC) {
+		struct ukevent *ukev;
+
+		ukev = kevent_get_user(num, arg);
+		if (ukev) {
+			for (i = 0; i < num; ++i) {
+				if (kevent_modify(&ukev[i], u))
+					ukev[i].ret_flags |= KEVENT_RET_BROKEN;
+				ukev[i].ret_flags |= KEVENT_RET_DONE;
+			}
+			if (copy_to_user(arg, ukev, num*sizeof(struct ukevent)))
+				err = -EFAULT;
+			kfree(ukev);
+			goto out;
+		}
+	}
+
+	for (i = 0; i < num; ++i) {
+		if (copy_from_user(&uk, arg, sizeof(struct ukevent))) {
+			err = -EFAULT;
+			break;
+		}
+
+		if (kevent_modify(&uk, u))
+			uk.ret_flags |= KEVENT_RET_BROKEN;
+		uk.ret_flags |= KEVENT_RET_DONE;
+
+		if (copy_to_user(arg, &uk, sizeof(struct ukevent))) {
+			err = -EFAULT;
+			break;
+		}
+
+		arg += sizeof(struct ukevent);
+	}
+out:
+	mutex_unlock(&u->ctl_mutex);
+
+	return err;
+}
+
+/*
+ * Read from userspace all ukevents and remove appropriate kevents.
+ * If provided number of ukevents is more that threshold, it is faster
+ * to allocate a room for them and copy in one shot instead of copy
+ * one-by-one and then process them.
+ */
+static int kevent_user_ctl_remove(struct kevent_user *u, unsigned int num, void __user *arg)
+{
+	int err = 0, i;
+	struct ukevent uk;
+
+	mutex_lock(&u->ctl_mutex);
+
+	if (num > u->kevent_num) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	if (num > KEVENT_MIN_BUFFS_ALLOC) {
+		struct ukevent *ukev;
+
+		ukev = kevent_get_user(num, arg);
+		if (ukev) {
+			for (i = 0; i < num; ++i) {
+				if (kevent_remove(&ukev[i], u))
+					ukev[i].ret_flags |= KEVENT_RET_BROKEN;
+				ukev[i].ret_flags |= KEVENT_RET_DONE;
+			}
+			if (copy_to_user(arg, ukev, num*sizeof(struct ukevent)))
+				err = -EFAULT;
+			kfree(ukev);
+			goto out;
+		}
+	}
+
+	for (i = 0; i < num; ++i) {
+		if (copy_from_user(&uk, arg, sizeof(struct ukevent))) {
+			err = -EFAULT;
+			break;
+		}
+
+		if (kevent_remove(&uk, u))
+			uk.ret_flags |= KEVENT_RET_BROKEN;
+
+		uk.ret_flags |= KEVENT_RET_DONE;
+
+		if (copy_to_user(arg, &uk, sizeof(struct ukevent))) {
+			err = -EFAULT;
+			break;
+		}
+
+		arg += sizeof(struct ukevent);
+	}
+out:
+	mutex_unlock(&u->ctl_mutex);
+
+	return err;
+}
+
+/*
+ * Queue kevent into userspace control block and increase
+ * it's reference counter.
+ */
+static int kevent_user_enqueue(struct kevent_user *u, struct kevent *new)
+{
+	unsigned long flags;
+	struct rb_node **p = &u->kevent_root.rb_node, *parent = NULL;
+	struct kevent *k;
+	int err = 0, cmp;
+
+	spin_lock_irqsave(&u->kevent_lock, flags);
+	while (*p) {
+		parent = *p;
+		k = rb_entry(parent, struct kevent, kevent_node);
+
+		cmp = kevent_compare_id(&k->event.id, &new->event.id,
+				k->event.type, new->event.type);
+		if (cmp > 0)
+			p = &parent->rb_right;
+		else if (cmp < 0)
+			p = &parent->rb_left;
+		else {
+			err = -EEXIST;
+			break;
+		}
+	}
+	if (likely(!err)) {
+		rb_link_node(&new->kevent_node, parent, p);
+		rb_insert_color(&new->kevent_node, &u->kevent_root);
+		new->flags |= KEVENT_USER;
+		u->kevent_num++;
+		kevent_user_get(u);
+	}
+	spin_unlock_irqrestore(&u->kevent_lock, flags);
+
+	return err;
+}
+
+/*
+ * Add kevent from both kernel and userspace users.
+ * This function allocates and queues kevent, returns negative value
+ * on error, positive if kevent is ready immediately and zero
+ * if kevent has been queued.
+ */
+int kevent_user_add_ukevent(struct ukevent *uk, struct kevent_user *u)
+{
+	struct kevent *k;
+	int err;
+
+	k = kmem_cache_alloc(kevent_cache, GFP_KERNEL);
+	if (!k) {
+		err = -ENOMEM;
+		goto err_out_exit;
+	}
+
+	memcpy(&k->event, uk, sizeof(struct ukevent));
+	INIT_RCU_HEAD(&k->rcu_head);
+
+	k->event.ret_flags = 0;
+
+	err = kevent_init(k);
+	if (err) {
+		kmem_cache_free(kevent_cache, k);
+		goto err_out_exit;
+	}
+	k->user = u;
+	kevent_stat_total(u);
+	err = kevent_user_enqueue(u, k);
+	if (err) {
+		kmem_cache_free(kevent_cache, k);
+		goto err_out_exit;
+	}
+
+	err = kevent_enqueue(k);
+	if (err) {
+		memcpy(uk, &k->event, sizeof(struct ukevent));
+		kevent_finish_user(k, 0);
+		goto err_out_exit;
+	}
+
+	return 0;
+
+err_out_exit:
+	if (err < 0) {
+		uk->ret_flags |= KEVENT_RET_BROKEN | KEVENT_RET_DONE;
+		uk->ret_data[1] = err;
+	} else if (err > 0)
+		uk->ret_flags |= KEVENT_RET_DONE;
+	return err;
+}
+
+
+/*
+ * Copy all ukevents from userspace, allocate kevent for each one
+ * and add them into appropriate kevent_storages,
+ * e.g. sockets, inodes and so on...
+ * Ready events will replace ones provided by used and number
+ * of ready events is returned.
+ * User must check ret_flags field of each ukevent structure
+ * to determine if it is fired or failed event.
+ */
+static int kevent_user_ctl_add(struct kevent_user *u, unsigned int num, void __user *arg)
+{
+	int err, cerr = 0, rnum = 0, i;
+	void __user *orig = arg;
+	struct ukevent uk;
+
+	mutex_lock(&u->ctl_mutex);
+
+	err = -EINVAL;
+	if (num > KEVENT_MIN_BUFFS_ALLOC) {
+		struct ukevent *ukev;
+
+		ukev = kevent_get_user(num, arg);
+		if (ukev) {
+			for (i = 0; i < num; ++i) {
+				if (!kevent_event_is_allowed(&ukev[i])) {
+					if (i != rnum)
+						memcpy(&ukev[rnum], 
+							&ukev[i], 
+							sizeof(struct ukevent));
+					rnum++;
+				} else {
+					err = kevent_user_add_ukevent(&ukev[i], u);
+					if (err) {
+						kevent_stat_im(u);
+						if (i != rnum)
+							memcpy(&ukev[rnum], 
+								&ukev[i], 
+								sizeof(struct ukevent));
+						rnum++;
+					}
+				}
+			}
+			if (copy_to_user(orig, ukev, rnum*sizeof(struct ukevent)))
+				cerr = -EFAULT;
+			kfree(ukev);
+			goto out_setup;
+		}
+	}
+
+	for (i = 0; i < num; ++i) {
+		if (copy_from_user(&uk, arg, sizeof(struct ukevent))) {
+			cerr = -EFAULT;
+			break;
+		}
+		arg += sizeof(struct ukevent);
+
+		if (!kevent_event_is_allowed(&uk)) {
+			err = 1;
+		} else {
+			err = kevent_user_add_ukevent(&uk, u);
+			if (err)
+				kevent_stat_im(u);
+		}
+		if (err) {
+			if (copy_to_user(orig, &uk, sizeof(struct ukevent))) {
+				cerr = -EFAULT;
+				break;
+			}
+			orig += sizeof(struct ukevent);
+			rnum++;
+		}
+	}
+
+out_setup:
+	if (cerr < 0) {
+		err = cerr;
+		goto out_remove;
+	}
+
+	err = rnum;
+out_remove:
+	mutex_unlock(&u->ctl_mutex);
+
+	return err;
+}
+
+/* Used to wakeup waiting syscalls in case high-resolution timer is used. */
+static int kevent_user_wake(struct hrtimer *timer)
+{
+	struct kevent_user *u = container_of(timer, struct kevent_user, timer);
+
+	u->need_exit = 1;
+	wake_up(&u->wait);
+
+	return HRTIMER_NORESTART;
+}
+
+
+/*
+ * In nonblocking mode it returns as many events as possible, but not more than @max_nr.
+ * In blocking mode it waits until timeout or if at least @min_nr events are ready.
+ */
+static int kevent_user_wait(struct file *file, struct kevent_user *u,
+		unsigned int min_nr, unsigned int max_nr, struct timespec timeout,
+		void __user *buf, unsigned int flags)
+{
+	struct kevent *k;
+	int num = 0;
+	long tm = MAX_SCHEDULE_TIMEOUT;
+
+	if (!(file->f_flags & O_NONBLOCK)) {
+		if (!timespec_valid(&timeout))
+			return -EINVAL;
+
+		if (flags & KEVENT_FLAGS_ABSTIME) {
+			hrtimer_cancel(&u->timer);
+			hrtimer_init(&u->timer, CLOCK_REALTIME, HRTIMER_ABS);
+			u->timer.expires = ktime_set(timeout.tv_sec, timeout.tv_nsec);
+			u->timer.function = &kevent_user_wake;
+			hrtimer_start(&u->timer, u->timer.expires, HRTIMER_ABS);
+			if (unlikely(kevent_debug_abstime == 0)) {
+				printk(KERN_INFO "kevent: author was wrong, "
+						"someone uses absolute time in %s, "
+						"please report to remove this warning.\n", __func__);
+				kevent_debug_abstime = 1;
+			}
+		} else {
+			tm = timespec_to_jiffies(&timeout);
+		}
+
+		wait_event_interruptible_timeout(u->wait,
+			((u->ready_num >= 1) && kevent_ring_space(u)) || u->need_exit, tm);
+	}
+	u->need_exit = 0;
+
+	while (num < max_nr && ((k = kevent_dequeue_ready(u)) != NULL)) {
+		if (copy_to_user(buf + num*sizeof(struct ukevent),
+					&k->event, sizeof(struct ukevent))) {
+			if (num == 0)
+				num = -EFAULT;
+			break;
+		}
+		kevent_complete_ready(k);
+		++num;
+		kevent_stat_wait(u);
+	}
+
+	return num;
+}
+
+struct file_operations kevent_user_fops = {
+	.release	= kevent_user_release,
+	.poll		= kevent_user_poll,
+	.owner		= THIS_MODULE,
+};
+
+static int kevent_ctl_process(struct file *file, unsigned int cmd, unsigned int num, void __user *arg)
+{
+	int err;
+	struct kevent_user *u = file->private_data;
+
+	switch (cmd) {
+	case KEVENT_CTL_ADD:
+		err = kevent_user_ctl_add(u, num, arg);
+		break;
+	case KEVENT_CTL_REMOVE:
+		err = kevent_user_ctl_remove(u, num, arg);
+		break;
+	case KEVENT_CTL_MODIFY:
+		err = kevent_user_ctl_modify(u, num, arg);
+		break;
+	case KEVENT_CTL_READY:
+		err = kevent_user_ctl_ready(u, num, arg);
+		break;
+	default:
+		err = -EINVAL;
+		break;
+	}
+
+	return err;
+}
+
+/*
+ * Used to get ready kevents from queue.
+ * @ctl_fd - kevent control descriptor which must be obtained through kevent_ctl(KEVENT_CTL_INIT).
+ * @min_nr - minimum number of ready kevents.
+ * @max_nr - maximum number of ready kevents.
+ * @timeout - time to wait until some events are ready.
+ * @buf - buffer to place ready events.
+ * @flags - various flags (see include/linux/ukevent.h KEVENT_FLAGS_*).
+ */
+asmlinkage long sys_kevent_get_events(int ctl_fd, unsigned int min_nr, unsigned int max_nr,
+		struct timespec timeout, struct ukevent __user *buf, unsigned flags)
+{
+	int err = -EINVAL;
+	struct file *file;
+	struct kevent_user *u;
+
+	file = fget(ctl_fd);
+	if (!file)
+		return -EBADF;
+
+	if (file->f_op != &kevent_user_fops)
+		goto out_fput;
+	u = file->private_data;
+
+	err = kevent_user_wait(file, u, min_nr, max_nr, timeout, buf, flags);
+out_fput:
+	fput(file);
+	return err;
+}
+
+static struct vfsmount *kevent_mnt __read_mostly;
+
+static int kevent_get_sb(struct file_system_type *fs_type, int flags,
+		   const char *dev_name, void *data, struct vfsmount *mnt)
+{
+	return get_sb_pseudo(fs_type, "kevent", NULL, 0xaabbccdd, mnt);
+}
+
+static struct file_system_type kevent_fs_type = {
+	.name		= "keventfs",
+	.get_sb		= kevent_get_sb,
+	.kill_sb	= kill_anon_super,
+};
+
+static int keventfs_delete_dentry(struct dentry *dentry)
+{
+	return 1;
+}
+
+static struct dentry_operations keventfs_dentry_operations = {
+	.d_delete	= keventfs_delete_dentry,
+};
+
+asmlinkage long sys_kevent_init(struct kevent_ring __user *ring, unsigned int num, unsigned int flags)
+{
+	struct qstr this;
+	char name[32];
+	struct dentry *dentry;
+	struct inode *inode;
+	struct file *file;
+	int err = -ENFILE, fd;
+	struct kevent_user *u;
+
+	if ((ring && !num) || (!ring && num) || (num == 1))
+		return -EINVAL;
+
+	file = get_empty_filp();
+	if (!file)
+		goto err_out_exit;
+
+	inode = new_inode(kevent_mnt->mnt_sb);
+	if (!inode)
+		goto err_out_fput;
+
+	inode->i_fop = &kevent_user_fops;
+
+	inode->i_state = I_DIRTY;
+	inode->i_mode = S_IRUSR | S_IWUSR;
+	inode->i_uid = current->fsuid;
+	inode->i_gid = current->fsgid;
+	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+
+	err = get_unused_fd();
+	if (err < 0)
+		goto err_out_iput;
+	fd = err;
+
+	err = -ENOMEM;
+	u = kevent_user_alloc(ring, num);
+	if (!u)
+		goto err_out_put_fd;
+
+	sprintf(name, "[%lu]", inode->i_ino);
+	this.name = name;
+	this.len = strlen(name);
+	this.hash = inode->i_ino;
+	dentry = d_alloc(kevent_mnt->mnt_sb->s_root, &this);
+	if (!dentry)
+		goto err_out_free;
+	dentry->d_op = &keventfs_dentry_operations;
+	d_add(dentry, inode);
+	file->f_vfsmnt = mntget(kevent_mnt);
+	file->f_dentry = dentry;
+	file->f_mapping = inode->i_mapping;
+	file->f_pos = 0;
+	file->f_flags = O_RDONLY;
+	file->f_op = &kevent_user_fops;
+	file->f_mode = FMODE_READ;
+	file->f_version = 0;
+	file->private_data = u;
+
+	fd_install(fd, file);
+
+	return fd;
+
+err_out_free:
+	kmem_cache_free(kevent_user_cache, u);
+err_out_put_fd:
+	put_unused_fd(fd);
+err_out_iput:
+	iput(inode);
+err_out_fput:
+	put_filp(file);
+err_out_exit:
+	return err;
+}
+
+/*
+ * Commits user's index (consumer index).
+ * Must be called under u->ring_lock mutex held.
+ */
+static int __kevent_user_commit(struct kevent_user *u, unsigned int new_uidx, unsigned int over)
+{
+	int err = -EOVERFLOW, comm = 0;
+	struct kevent_ring __user *ring = u->pring;
+
+	if (!ring) {
+		err = 0;
+		goto err_out_exit;
+	}
+
+	if (new_uidx >= u->ring_size) {
+		err = -EINVAL;
+		goto err_out_exit;
+	}
+
+	if ((over != u->ring_over - 1) && (over != u->ring_over))
+		goto err_out_exit;
+
+	if (u->uidx < u->kidx && new_uidx > u->kidx) {
+		err = -EINVAL;
+		goto err_out_exit;
+	}
+
+	if (new_uidx > u->uidx) {
+		if (over != u->ring_over)
+			goto err_out_exit;
+
+		comm = new_uidx - u->uidx;
+		u->uidx = new_uidx;
+		u->full = 0;
+	} else if (new_uidx < u->uidx) {
+		comm = u->ring_size - (u->uidx - new_uidx);
+		u->uidx = new_uidx;
+		u->full = 0;
+		u->ring_over++;
+
+		if (put_user(u->ring_over, &ring->ring_over)) {
+			err = -EFAULT;
+			goto err_out_exit;
+		}
+	}
+
+	return comm;
+
+err_out_exit:
+	return err;
+}
+
+/*
+ * This syscall is used to perform waiting until there is free space in the ring
+ * buffer, in that case some events will be copied there.
+ * Function returns number of actually copied ready events in ring buffer.
+ * After this function is completed userspace ring->ring_kidx will be updated.
+ *
+ * @ctl_fd - kevent file descriptor.
+ * @num - number of kevents to process.
+ * @old_uidx - the last index user is aware of.
+ * @timeout - time to wait until there is free space in kevent queue.
+ * @flags - various flags (see include/linux/ukevent.h KEVENT_FLAGS_*).
+ *
+ * When we need to commit @num events, it means we should just remove first @num
+ * kevents from ready queue and copy them into the buffer. 
+ * Kevents will be copied into ring buffer in order they were placed into ready queue.
+ * One-shot kevents will be removed here, since there is no way they can be reused.
+ * Edge-triggered events will be requeued here for better performance.
+ */
+asmlinkage long sys_kevent_wait(int ctl_fd, unsigned int num, unsigned int old_uidx, 
+		struct timespec timeout, unsigned int flags)
+{
+	int err = -EINVAL, copied = 0;
+	struct file *file;
+	struct kevent_user *u;
+	struct kevent *k;
+	struct kevent_ring __user *ring;
+	long tm = MAX_SCHEDULE_TIMEOUT;
+	unsigned int i;
+
+	file = fget(ctl_fd);
+	if (!file)
+		return -EBADF;
+
+	if (file->f_op != &kevent_user_fops)
+		goto out_fput;
+	u = file->private_data;
+
+	ring = u->pring;
+	if (!ring || num > u->ring_size)
+		goto out_fput;
+#if 0
+	/*
+	 * Allow to immediately update ring index, but it is not supported,
+	 * since syscall() has limited number of arguments which is actually
+	 * a good idea - use kevent_commit() instead.
+	 */
+	if ((u->uidx != new_uidx) && (new_uidx != 0xffffffff)) {
+		mutex_lock(&u->ring_lock);
+		__kevent_user_commit(u, new_uidx, over);
+		mutex_unlock(&u->ring_lock);
+	}
+#endif
+
+	if (!(file->f_flags & O_NONBLOCK)) {
+		if (!timespec_valid(&timeout))
+			goto out_fput;
+
+		if (flags & KEVENT_FLAGS_ABSTIME) {
+			hrtimer_cancel(&u->timer);
+			hrtimer_init(&u->timer, CLOCK_REALTIME, HRTIMER_ABS);
+			u->timer.expires = ktime_set(timeout.tv_sec, timeout.tv_nsec);
+			u->timer.function = &kevent_user_wake;
+			hrtimer_start(&u->timer, u->timer.expires, HRTIMER_ABS);
+			if (unlikely(kevent_debug_abstime == 0)) {
+				printk(KERN_INFO "kevent: author was wrong, "
+						"someone uses absolute time in %s, "
+						"please report to remove this warning.\n", __func__);
+				kevent_debug_abstime = 1;
+			}
+		} else {
+			tm = timespec_to_jiffies(&timeout);
+		}
+
+		wait_event_interruptible_timeout(u->wait,
+			((u->ready_num >= 1) && kevent_ring_space(u)) || 
+				u->need_exit || old_uidx != u->uidx,
+				tm);
+	}
+	u->need_exit = 0;
+
+	for (i=0; i<num; ++i) {
+		k = kevent_dequeue_ready_ring(u);
+		if (!k)
+			break;
+		kevent_complete_ready(k);
+
+		if (k->event.ret_flags & KEVENT_RET_COPY_FAILED)
+			break;
+		kevent_stat_ring(u);
+		copied++;
+	}
+
+	fput(file);
+
+	return copied;
+out_fput:
+	fput(file);
+	return err;
+}
+
+/*
+ * This syscall is used to commit events in ring buffer, i.e. mark appropriate
+ * entries as unused by userspace so subsequent kevent_wait() could overwrite them.
+ * This fucntion returns actual number of kevents which were committed.
+ * After this function is completed userspace ring->ring_over can be updated.
+ *
+ * @ctl_fd - kevent file descriptor.
+ * @new_uidx - the last committed kevent.
+ * @over - number of overflows given queue had.
+ */
+asmlinkage long sys_kevent_commit(int ctl_fd, unsigned int new_uidx, unsigned int over)
+{
+	int err = -EINVAL, comm = 0;
+	struct file *file;
+	struct kevent_user *u;
+
+	file = fget(ctl_fd);
+	if (!file)
+		return -EBADF;
+
+	if (file->f_op != &kevent_user_fops)
+		goto out_fput;
+	u = file->private_data;
+
+	mutex_lock(&u->ring_lock);
+	err = __kevent_user_commit(u, new_uidx, over);
+	if (err < 0)
+		goto err_out_unlock;
+	comm = err;
+	mutex_unlock(&u->ring_lock);
+
+	fput(file);
+
+	return comm;
+
+err_out_unlock:
+	mutex_unlock(&u->ring_lock);
+out_fput:
+	fput(file);
+	return err;
+}
+
+/*
+ * This syscall is used to perform various control operations
+ * on given kevent queue, which is obtained through kevent file descriptor @fd.
+ * @cmd - type of operation.
+ * @num - number of kevents to be processed.
+ * @arg - pointer to array of struct ukevent.
+ */
+asmlinkage long sys_kevent_ctl(int fd, unsigned int cmd, unsigned int num, struct ukevent __user *arg)
+{
+	int err = -EINVAL;
+	struct file *file;
+
+	file = fget(fd);
+	if (!file)
+		return -EBADF;
+
+	if (file->f_op != &kevent_user_fops)
+		goto out_fput;
+
+	err = kevent_ctl_process(file, cmd, num, arg);
+
+out_fput:
+	fput(file);
+	return err;
+}
+
+/*
+ * Kevent subsystem initialization - create caches and register
+ * filesystem to get control file descriptors from.
+ */
+static int __init kevent_user_init(void)
+{
+	int err = 0;
+
+	kevent_cache = kmem_cache_create("kevent_cache",
+			sizeof(struct kevent), 0, SLAB_PANIC, NULL, NULL);
+	
+	kevent_user_cache = kmem_cache_create("kevent_user_cache",
+			sizeof(struct kevent_user), 0, SLAB_PANIC, NULL, NULL);
+	
+	err = register_filesystem(&kevent_fs_type);
+	if (err)
+		goto err_out_exit;
+	
+	kevent_mnt = kern_mount(&kevent_fs_type);
+	err = PTR_ERR(kevent_mnt);
+	if (IS_ERR(kevent_mnt))
+		goto err_out_unreg;
+
+	printk(KERN_INFO "KEVENT subsystem has been successfully registered.\n");
+
+	return 0;
+
+err_out_unreg:
+	unregister_filesystem(&kevent_fs_type);
+err_out_exit:
+	kmem_cache_destroy(kevent_cache);
+	return err;
+}
+
+module_init(kevent_user_init);
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index d7306d0..4546563 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -123,6 +123,12 @@ cond_syscall(ppc_rtas);
 cond_syscall(sys_spu_run);
 cond_syscall(sys_spu_create);
 
+cond_syscall(sys_kevent_get_events);
+cond_syscall(sys_kevent_ctl);
+cond_syscall(sys_kevent_wait);
+cond_syscall(sys_kevent_commit);
+cond_syscall(sys_kevent_init);
+
 /* mmu depending weak syscall entries */
 cond_syscall(sys_mprotect);
 cond_syscall(sys_msync);


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [take30 6/9] kevent: Pipe notifications.
  2006-12-29 12:25           ` [take30 5/9] kevent: Timer notifications Evgeniy Polyakov
@ 2006-12-29 12:25             ` Evgeniy Polyakov
  2006-12-29 12:25               ` [take30 7/9] kevent: Signal notifications Evgeniy Polyakov
  0 siblings, 1 reply; 76+ messages in thread
From: Evgeniy Polyakov @ 2006-12-29 12:25 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: David Miller, Ulrich Drepper, Andrew Morton, Evgeniy Polyakov,
	netdev, Zach Brown, Christoph Hellwig, Chase Venters,
	Johann Borck, linux-kernel, Jeff Garzik, Jamal Hadi Salim,
	Ingo Molnar


Pipe notifications.


diff --git a/fs/pipe.c b/fs/pipe.c
index 68090e8..0c75bf1 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -16,6 +16,7 @@
 #include <linux/uio.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
+#include <linux/kevent.h>
 
 #include <asm/uaccess.h>
 #include <asm/ioctls.h>
@@ -313,6 +314,7 @@ redo:
 			break;
 		}
 		if (do_wakeup) {
+			kevent_pipe_notify(inode, KEVENT_SOCKET_SEND);
 			wake_up_interruptible_sync(&pipe->wait);
  			kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
 		}
@@ -322,6 +324,7 @@ redo:
 
 	/* Signal writers asynchronously that there is more room. */
 	if (do_wakeup) {
+		kevent_pipe_notify(inode, KEVENT_SOCKET_SEND);
 		wake_up_interruptible(&pipe->wait);
 		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
 	}
@@ -484,6 +487,7 @@ redo2:
 			break;
 		}
 		if (do_wakeup) {
+			kevent_pipe_notify(inode, KEVENT_SOCKET_RECV);
 			wake_up_interruptible_sync(&pipe->wait);
 			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 			do_wakeup = 0;
@@ -495,6 +499,7 @@ redo2:
 out:
 	mutex_unlock(&inode->i_mutex);
 	if (do_wakeup) {
+		kevent_pipe_notify(inode, KEVENT_SOCKET_RECV);
 		wake_up_interruptible(&pipe->wait);
 		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 	}
@@ -590,6 +595,7 @@ pipe_release(struct inode *inode, int decr, int decw)
 		free_pipe_info(inode);
 	} else {
 		wake_up_interruptible(&pipe->wait);
+		kevent_pipe_notify(inode, KEVENT_SOCKET_SEND|KEVENT_SOCKET_RECV);
 		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
 	}
diff --git a/kernel/kevent/kevent_pipe.c b/kernel/kevent/kevent_pipe.c
new file mode 100644
index 0000000..91dc1eb
--- /dev/null
+++ b/kernel/kevent/kevent_pipe.c
@@ -0,0 +1,123 @@
+/*
+ * 	kevent_pipe.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/kevent.h>
+#include <linux/pipe_fs_i.h>
+
+static int kevent_pipe_callback(struct kevent *k)
+{
+	struct inode *inode = k->st->origin;
+	struct pipe_inode_info *pipe = inode->i_pipe;
+	int nrbufs = pipe->nrbufs;
+
+	if (k->event.event & KEVENT_SOCKET_RECV && nrbufs > 0) {
+		if (!pipe->writers)
+			return -1;
+		return 1;
+	}
+	
+	if (k->event.event & KEVENT_SOCKET_SEND && nrbufs < PIPE_BUFFERS) {
+		if (!pipe->readers)
+			return -1;
+		return 1;
+	}
+
+	return 0;
+}
+
+int kevent_pipe_enqueue(struct kevent *k)
+{
+	struct file *pipe;
+	int err = -EBADF;
+	struct inode *inode;
+
+	pipe = fget(k->event.id.raw[0]);
+	if (!pipe)
+		goto err_out_exit;
+
+	inode = igrab(pipe->f_dentry->d_inode);
+	if (!inode)
+		goto err_out_fput;
+
+	err = -EINVAL;
+	if (!S_ISFIFO(inode->i_mode))
+		goto err_out_iput;
+
+	err = kevent_storage_enqueue(&inode->st, k);
+	if (err)
+		goto err_out_iput;
+
+	if (k->event.req_flags & KEVENT_REQ_ALWAYS_QUEUE) {
+		kevent_requeue(k);
+		err = 0;
+	} else {
+		err = k->callbacks.callback(k);
+		if (err)
+			goto err_out_dequeue;
+	}
+
+	fput(pipe);
+
+	return err;
+
+err_out_dequeue:
+	kevent_storage_dequeue(k->st, k);
+err_out_iput:
+	iput(inode);
+err_out_fput:
+	fput(pipe);
+err_out_exit:
+	return err;
+}
+
+int kevent_pipe_dequeue(struct kevent *k)
+{
+	struct inode *inode = k->st->origin;
+
+	kevent_storage_dequeue(k->st, k);
+	iput(inode);
+
+	return 0;
+}
+
+void kevent_pipe_notify(struct inode *inode, u32 event)
+{
+	kevent_storage_ready(&inode->st, NULL, event);
+}
+
+static int __init kevent_init_pipe(void)
+{
+	struct kevent_callbacks sc = {
+		.callback = &kevent_pipe_callback,
+		.enqueue = &kevent_pipe_enqueue,
+		.dequeue = &kevent_pipe_dequeue,
+		.flags = 0,
+	};
+
+	return kevent_add_callbacks(&sc, KEVENT_PIPE);
+}
+module_init(kevent_init_pipe);


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [take30 1/9] kevent: Description.
  2006-12-29 12:25 ` [take30 0/9] " Evgeniy Polyakov
@ 2006-12-29 12:25   ` Evgeniy Polyakov
  2006-12-29 12:25     ` [take30 2/9] kevent: Core files Evgeniy Polyakov
  0 siblings, 1 reply; 76+ messages in thread
From: Evgeniy Polyakov @ 2006-12-29 12:25 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: David Miller, Ulrich Drepper, Andrew Morton, Evgeniy Polyakov,
	netdev, Zach Brown, Christoph Hellwig, Chase Venters,
	Johann Borck, linux-kernel, Jeff Garzik, Jamal Hadi Salim,
	Ingo Molnar


Description.


diff --git a/Documentation/kevent.txt b/Documentation/kevent.txt
new file mode 100644
index 0000000..95cb36e
--- /dev/null
+++ b/Documentation/kevent.txt
@@ -0,0 +1,244 @@
+Description.
+
+int kevent_init(struct kevent_ring *ring, unsigned int ring_size, 
+	unsigned int flags);
+
+num - size of the ring buffer in events 
+ring - pointer to allocated ring buffer
+flags - various flags, see KEVENT_FLAGS_* definitions.
+
+Return value: kevent control file descriptor or negative error value.
+
+ struct kevent_ring
+ {
+   unsigned int ring_kidx, ring_over;
+   struct ukevent event[0];
+ }
+
+ring_kidx - index in the ring buffer where kernel will put new events 
+		when kevent_wait() or kevent_get_events() is called 
+ring_over - number of overflows of ring_uidx happend from the start.
+	Overflow counter is used to prevent situation when two threads 
+	are going to free the same events, but one of them was scheduled 
+	away for too long, so ring indexes were wrapped, so when that 
+	thread will be awakened, it will free not those events, which 
+	it suppose to free.
+
+Example userspace code (ring_buffer.c) can be found on project's homepage.
+
+Each kevent syscall can be so called cancellation point in glibc, i.e. when 
+thread has been cancelled in kevent syscall, thread can be safely removed 
+and no events will be lost, since each syscall (kevent_wait() or 
+kevent_get_events()) will copy event into special ring buffer, accessible 
+from other threads or even processes (if shared memory is used).
+
+When kevent is removed (not dequeued when it is ready, but just removed), 
+even if it was ready, it is not copied into ring buffer, since if it is 
+removed, no one cares about it (otherwise user would wait until it becomes 
+ready and got it through usual way using kevent_get_events() or kevent_wait()) 
+and thus no need to copy it to the ring buffer.
+
+-------------------------------------------------------------------------------
+
+
+int kevent_ctl(int fd, unsigned int cmd, unsigned int num, struct ukevent *arg);
+
+fd - is the file descriptor referring to the kevent queue to manipulate. 
+It is created by opening "/dev/kevent" char device, which is created with 
+dynamic minor number and major number assigned for misc devices. 
+
+cmd - is the requested operation. It can be one of the following:
+    KEVENT_CTL_ADD - add event notification 
+    KEVENT_CTL_REMOVE - remove event notification 
+    KEVENT_CTL_MODIFY - modify existing notification 
+    KEVENT_CTL_READY - mark existing events as ready, if number of events is zero,
+    	it just wakes up parked in syscall thread
+
+num - number of struct ukevent in the array pointed to by arg 
+arg - array of struct ukevent
+
+Return value: 
+ number of events processed or negative error value.
+
+When called, kevent_ctl will carry out the operation specified in the 
+cmd parameter.
+-------------------------------------------------------------------------------
+
+ int kevent_get_events(int ctl_fd, unsigned int min_nr, unsigned int max_nr, 
+ 		struct timespec timeout, struct ukevent *buf, unsigned flags);
+
+ctl_fd - file descriptor referring to the kevent queue 
+min_nr - minimum number of completed events that kevent_get_events will block 
+	 waiting for 
+max_nr - number of struct ukevent in buf 
+timeout - time to wait before returning less than min_nr 
+	  events. If this is -1, then wait forever. 
+buf - pointer to an array of struct ukevent. 
+flags - various flags, see KEVENT_FLAGS_* definitions.
+
+Return value:
+ number of events copied or negative error value.
+
+kevent_get_events will wait timeout milliseconds for at least min_nr completed 
+events, copying completed struct ukevents to buf and deleting any 
+KEVENT_REQ_ONESHOT event requests. In nonblocking mode it returns as many 
+events as possible, but not more than max_nr. In blocking mode it waits until 
+timeout or if at least min_nr events are ready.
+
+This function copies event into ring buffer if it was initialized, if ring buffer
+is full, KEVENT_RET_COPY_FAILED flag is set in ret_flags field.
+-------------------------------------------------------------------------------
+
+ int kevent_wait(int ctl_fd, unsigned int num, unsigned int old_uidx, 
+ 	struct timespec timeout, unsigned int flags);
+
+ctl_fd - file descriptor referring to the kevent queue 
+num - number of processed kevents 
+old_uidx - the last index user is aware of
+timeout - time to wait until there is free space in kevent queue
+flags - various flags, see KEVENT_FLAGS_* definitions.
+
+Return value:
+ number of events copied into ring buffer or negative error value.
+
+This syscall waits until either timeout expires or at least one event becomes 
+ready. It also copies events into special ring buffer. If ring buffer is full,
+it waits until there are ready events and then return.
+If kevent is one-shot kevent it is removed in this syscall.
+If kevent is edge-triggered (KEVENT_REQ_ET flag is set in 'req_flags') it is 
+requeued in this syscall for performance reasons.
+-------------------------------------------------------------------------------
+
+ int kevent_commit(int ctl_fd, unsigned int new_idx, unsigned int over);
+
+ctl_fd - file descriptor referring to the kevent queue 
+new_uidx - the last committed kevent
+over - overflow count for given $new_idx value
+
+Return value:
+ number of committed kevents or negative error value.
+
+This function commits, i.e. marks as empty, slots in the ring buffer, so
+they can be reused when userspace completes that entries processing.
+
+Overflow counter is used to prevent situation when two threads are going 
+to free the same events, but one of them was scheduled away for too long, 
+so ring indexes were wrapped, so when that thread will be awakened, it 
+will free not those events, which it suppose to free.
+
+It is possible that returned number of committed events will be smaller than
+requested number - it is possible when several threads try to commit the
+same events.
+-------------------------------------------------------------------------------
+
+The bulk of the interface is entirely done through the ukevent struct. 
+It is used to add event requests, modify existing event requests, 
+specify which event requests to remove, and return completed events.
+
+struct ukevent contains the following members:
+
+struct kevent_id id
+    Id of this request, e.g. socket number, file descriptor and so on 
+__u32 type
+    Event type, e.g. KEVENT_SOCK, KEVENT_INODE, KEVENT_TIMER and so on 
+__u32 event
+    Event itself, e.g. SOCK_ACCEPT, INODE_CREATED, TIMER_FIRED 
+__u32 req_flags
+    Per-event request flags,
+
+    KEVENT_REQ_ONESHOT
+        event will be removed when it is ready 
+
+    KEVENT_REQ_WAKEUP_ALL
+        Kevent wakes up only first thread interested in given event, 
+	or all threads if this flag is set.
+
+    KEVENT_REQ_ET
+        Edge Triggered behaviour. It is an optimisation which allows to move 
+	ready and dequeued (i.e. copied to userspace) event to move into set 
+	of interest for given storage (socket, inode and so on) again. It is 
+	very usefull for cases when the same event should be used many times 
+	(like reading from pipe). It is similar to epoll()'s EPOLLET flag. 
+
+    KEVENT_REQ_LAST_CHECK
+        if set allows to perform the last check on kevent (call appropriate 
+	callback) when kevent is marked as ready and has been removed from 
+	ready queue. If it will be confirmed that kevent is ready 
+	(k->callbacks.callback(k) returns true) then kevent will be copied 
+	to userspace, otherwise it will be requeued back to storage. 
+	Second (checking) call is performed with this bit cleared, so callback 
+	can detect when it was called from kevent_storage_ready() - bit is set, 
+	or kevent_dequeue_ready() - bit is cleared. If kevent will be requeued, 
+	bit will be set again.
+
+   KEVENT_REQ_ALWAYS_QUEUE
+        If this flag is set kevent will be queued into ready queue if it is 
+	ready at enqueue time, otherwise it will be copied back to userspace
+	and will not be queued into the storage.
+
+   KEVENT_REQ_READY
+   	If this flag is set, kevent will be marked as ready immediately at enqueue
+	time.
+
+__u32 ret_flags
+    Per-event return flags
+
+    KEVENT_RET_BROKEN
+        Kevent is broken 
+
+    KEVENT_RET_DONE
+        Kevent processing was finished successfully 
+
+    KEVENT_RET_COPY_FAILED
+        Kevent was not copied into ring buffer due to some error conditions. 
+
+__u32 ret_data
+    Event return data. Event originator fills it with anything it likes 
+    (for example timer notifications put number of milliseconds when timer 
+    has fired 
+union { __u32 user[2]; void *ptr; }
+    User's data. It is not used, just copied to/from user. The whole structure 
+    is aligned to 8 bytes already, so the last union is aligned properly. 
+
+-------------------------------------------------------------------------------
+
+Kevent waiting syscall flags.
+
+KEVENT_FLAGS_ABSTIME - provided timespec parameter contains absolute time, 
+	for example Aug 27, 2194, or time(NULL) + 10.
+
+-------------------------------------------------------------------------------
+
+Usage
+
+For KEVENT_CTL_ADD, all fields relevant to the event type must be filled 
+(id, type, event, req_flags). 
+After kevent_ctl(..., KEVENT_CTL_ADD, ...) returns each struct's ret_flags 
+should be checked to see if the event is already broken or done.
+
+For KEVENT_CTL_MODIFY, the id, req_flags, and user and event fields must be 
+set and an existing kevent request must have matching id and user fields. If 
+match is found, req_flags and event are replaced with the newly supplied 
+values and requeueing is started, so modified kevent can be checked and 
+probably marked as ready immediately. If a match can't be found, the 
+passed in ukevent's ret_flags has KEVENT_RET_BROKEN set. KEVENT_RET_DONE is 
+always set.
+
+For KEVENT_CTL_REMOVE, the id and user fields must be set and an existing 
+kevent request must have matching id and user fields. If a match is found, 
+the kevent request is removed. If a match can't be found, the passed in 
+ukevent's ret_flags has KEVENT_RET_BROKEN set. KEVENT_RET_DONE is always set.
+
+For kevent_get_events, the entire structure is returned.
+
+-------------------------------------------------------------------------------
+
+Usage cases
+
+kevent_timer
+struct ukevent should contain following fields:
+    type - KEVENT_TIMER 
+    event - KEVENT_TIMER_FIRED 
+    req_flags - KEVENT_REQ_ONESHOT if you want to fire that timer only once 
+    id.raw[0] - number of seconds after commit when this timer shout expire 
+    id.raw[0] - additional to number of seconds number of nanoseconds 


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [take30 4/9] kevent: Socket notifications.
  2006-12-29 12:25       ` [take30 3/9] kevent: poll/select() notifications Evgeniy Polyakov
@ 2006-12-29 12:25         ` Evgeniy Polyakov
  2006-12-29 12:25           ` [take30 5/9] kevent: Timer notifications Evgeniy Polyakov
  0 siblings, 1 reply; 76+ messages in thread
From: Evgeniy Polyakov @ 2006-12-29 12:25 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: David Miller, Ulrich Drepper, Andrew Morton, Evgeniy Polyakov,
	netdev, Zach Brown, Christoph Hellwig, Chase Venters,
	Johann Borck, linux-kernel, Jeff Garzik, Jamal Hadi Salim,
	Ingo Molnar


Socket notifications.

This patch includes socket send/recv/accept notifications.
Using trivial web server based on kevent and this features
instead of epoll it's performance increased more than noticebly.
More details about various benchmarks and server itself 
(evserver_kevent.c) can be found on project's homepage.

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mitp.ru>

diff --git a/fs/inode.c b/fs/inode.c
index bf21dc6..82817b1 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -21,6 +21,7 @@
 #include <linux/cdev.h>
 #include <linux/bootmem.h>
 #include <linux/inotify.h>
+#include <linux/kevent.h>
 #include <linux/mount.h>
 
 /*
@@ -164,12 +165,18 @@ static struct inode *alloc_inode(struct super_block *sb)
 		}
 		inode->i_private = NULL;
 		inode->i_mapping = mapping;
+#if defined CONFIG_KEVENT_SOCKET || defined CONFIG_KEVENT_PIPE
+		kevent_storage_init(inode, &inode->st);
+#endif
 	}
 	return inode;
 }
 
 void destroy_inode(struct inode *inode) 
 {
+#if defined CONFIG_KEVENT_SOCKET || defined CONFIG_KEVENT_PIPE
+	kevent_storage_fini(&inode->st);
+#endif
 	BUG_ON(inode_has_buffers(inode));
 	security_inode_free(inode);
 	if (inode->i_sb->s_op->destroy_inode)
diff --git a/include/net/sock.h b/include/net/sock.h
index 03684e7..d840399 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -49,6 +49,7 @@
 #include <linux/skbuff.h>	/* struct sk_buff */
 #include <linux/mm.h>
 #include <linux/security.h>
+#include <linux/kevent.h>
 
 #include <linux/filter.h>
 
@@ -451,6 +452,21 @@ static inline int sk_stream_memory_free(struct sock *sk)
 
 extern void sk_stream_rfree(struct sk_buff *skb);
 
+struct socket_alloc {
+	struct socket socket;
+	struct inode vfs_inode;
+};
+
+static inline struct socket *SOCKET_I(struct inode *inode)
+{
+	return &container_of(inode, struct socket_alloc, vfs_inode)->socket;
+}
+
+static inline struct inode *SOCK_INODE(struct socket *socket)
+{
+	return &container_of(socket, struct socket_alloc, socket)->vfs_inode;
+}
+
 static inline void sk_stream_set_owner_r(struct sk_buff *skb, struct sock *sk)
 {
 	skb->sk = sk;
@@ -478,6 +494,7 @@ static inline void sk_add_backlog(struct sock *sk, struct sk_buff *skb)
 		sk->sk_backlog.tail = skb;
 	}
 	skb->next = NULL;
+	kevent_socket_notify(sk, KEVENT_SOCKET_RECV);
 }
 
 #define sk_wait_event(__sk, __timeo, __condition)		\
@@ -679,21 +696,6 @@ static inline struct kiocb *siocb_to_kiocb(struct sock_iocb *si)
 	return si->kiocb;
 }
 
-struct socket_alloc {
-	struct socket socket;
-	struct inode vfs_inode;
-};
-
-static inline struct socket *SOCKET_I(struct inode *inode)
-{
-	return &container_of(inode, struct socket_alloc, vfs_inode)->socket;
-}
-
-static inline struct inode *SOCK_INODE(struct socket *socket)
-{
-	return &container_of(socket, struct socket_alloc, socket)->vfs_inode;
-}
-
 extern void __sk_stream_mem_reclaim(struct sock *sk);
 extern int sk_stream_mem_schedule(struct sock *sk, int size, int kind);
 
diff --git a/include/net/tcp.h b/include/net/tcp.h
index b7d8317..2763b30 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -864,6 +864,7 @@ static inline int tcp_prequeue(struct sock *sk, struct sk_buff *skb)
 			tp->ucopy.memory = 0;
 		} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
 			wake_up_interruptible(sk->sk_sleep);
+			kevent_socket_notify(sk, KEVENT_SOCKET_RECV|KEVENT_SOCKET_SEND);
 			if (!inet_csk_ack_scheduled(sk))
 				inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
 						          (3 * TCP_RTO_MIN) / 4,
diff --git a/kernel/kevent/kevent_socket.c b/kernel/kevent/kevent_socket.c
new file mode 100644
index 0000000..d1a2701
--- /dev/null
+++ b/kernel/kevent/kevent_socket.c
@@ -0,0 +1,144 @@
+/*
+ * 	kevent_socket.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/timer.h>
+#include <linux/file.h>
+#include <linux/tcp.h>
+#include <linux/kevent.h>
+
+#include <net/sock.h>
+#include <net/request_sock.h>
+#include <net/inet_connection_sock.h>
+
+static int kevent_socket_callback(struct kevent *k)
+{
+	struct inode *inode = k->st->origin;
+	unsigned int events = SOCKET_I(inode)->ops->poll(SOCKET_I(inode)->file, SOCKET_I(inode), NULL);
+
+	if ((events & (POLLIN | POLLRDNORM)) && (k->event.event & (KEVENT_SOCKET_RECV | KEVENT_SOCKET_ACCEPT)))
+		return 1;
+	if ((events & (POLLOUT | POLLWRNORM)) && (k->event.event & KEVENT_SOCKET_SEND))
+		return 1;
+	if (events & (POLLERR | POLLHUP | POLLRDHUP | POLLREMOVE))
+		return -1;
+	return 0;
+}
+
+int kevent_socket_enqueue(struct kevent *k)
+{
+	struct inode *inode;
+	struct socket *sock;
+	int err = -EBADF;
+
+	sock = sockfd_lookup(k->event.id.raw[0], &err);
+	if (!sock)
+		goto err_out_exit;
+
+	inode = igrab(SOCK_INODE(sock));
+	if (!inode)
+		goto err_out_fput;
+
+	err = kevent_storage_enqueue(&inode->st, k);
+	if (err)
+		goto err_out_iput;
+
+	if (k->event.req_flags & KEVENT_REQ_ALWAYS_QUEUE) {
+		kevent_requeue(k);
+		err = 0;
+	} else {
+		err = k->callbacks.callback(k);
+		if (err)
+			goto err_out_dequeue;
+	}
+
+	return err;
+
+err_out_dequeue:
+	kevent_storage_dequeue(k->st, k);
+err_out_iput:
+	iput(inode);
+err_out_fput:
+	sockfd_put(sock);
+err_out_exit:
+	return err;
+}
+
+int kevent_socket_dequeue(struct kevent *k)
+{
+	struct inode *inode = k->st->origin;
+	struct socket *sock;
+
+	kevent_storage_dequeue(k->st, k);
+
+	sock = SOCKET_I(inode);
+	iput(inode);
+	sockfd_put(sock);
+
+	return 0;
+}
+
+void kevent_socket_notify(struct sock *sk, u32 event)
+{
+	if (sk->sk_socket)
+		kevent_storage_ready(&SOCK_INODE(sk->sk_socket)->st, NULL, event);
+}
+
+/*
+ * It is required for network protocols compiled as modules, like IPv6.
+ */
+EXPORT_SYMBOL_GPL(kevent_socket_notify);
+
+#ifdef CONFIG_LOCKDEP
+static struct lock_class_key kevent_sock_key;
+
+void kevent_socket_reinit(struct socket *sock)
+{
+	struct inode *inode = SOCK_INODE(sock);
+
+	lockdep_set_class(&inode->st.lock, &kevent_sock_key);
+}
+
+void kevent_sk_reinit(struct sock *sk)
+{
+	if (sk->sk_socket) {
+		struct inode *inode = SOCK_INODE(sk->sk_socket);
+
+		lockdep_set_class(&inode->st.lock, &kevent_sock_key);
+	}
+}
+#endif
+static int __init kevent_init_socket(void)
+{
+	struct kevent_callbacks sc = {
+		.callback = &kevent_socket_callback,
+		.enqueue = &kevent_socket_enqueue,
+		.dequeue = &kevent_socket_dequeue,
+		.flags = 0,
+	};
+
+	return kevent_add_callbacks(&sc, KEVENT_SOCKET);
+}
+module_init(kevent_init_socket);
diff --git a/net/core/sock.c b/net/core/sock.c
index 0ed5b4f..e687f54 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1393,6 +1393,7 @@ static void sock_def_wakeup(struct sock *sk)
 	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
 		wake_up_interruptible_all(sk->sk_sleep);
 	read_unlock(&sk->sk_callback_lock);
+	kevent_socket_notify(sk, KEVENT_SOCKET_RECV|KEVENT_SOCKET_SEND);
 }
 
 static void sock_def_error_report(struct sock *sk)
@@ -1402,6 +1403,7 @@ static void sock_def_error_report(struct sock *sk)
 		wake_up_interruptible(sk->sk_sleep);
 	sk_wake_async(sk,0,POLL_ERR); 
 	read_unlock(&sk->sk_callback_lock);
+	kevent_socket_notify(sk, KEVENT_SOCKET_RECV|KEVENT_SOCKET_SEND);
 }
 
 static void sock_def_readable(struct sock *sk, int len)
@@ -1411,6 +1413,7 @@ static void sock_def_readable(struct sock *sk, int len)
 		wake_up_interruptible(sk->sk_sleep);
 	sk_wake_async(sk,1,POLL_IN);
 	read_unlock(&sk->sk_callback_lock);
+	kevent_socket_notify(sk, KEVENT_SOCKET_RECV|KEVENT_SOCKET_SEND);
 }
 
 static void sock_def_write_space(struct sock *sk)
@@ -1430,6 +1433,7 @@ static void sock_def_write_space(struct sock *sk)
 	}
 
 	read_unlock(&sk->sk_callback_lock);
+	kevent_socket_notify(sk, KEVENT_SOCKET_SEND|KEVENT_SOCKET_RECV);
 }
 
 static void sock_def_destruct(struct sock *sk)
@@ -1480,6 +1484,8 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 	sk->sk_state		=	TCP_CLOSE;
 	sk->sk_socket		=	sock;
 
+	kevent_sk_reinit(sk);
+
 	sock_set_flag(sk, SOCK_ZAPPED);
 
 	if(sock)
@@ -1546,8 +1552,10 @@ void fastcall release_sock(struct sock *sk)
 	if (sk->sk_backlog.tail)
 		__release_sock(sk);
 	sk->sk_lock.owner = NULL;
-	if (waitqueue_active(&sk->sk_lock.wq))
+	if (waitqueue_active(&sk->sk_lock.wq)) {
 		wake_up(&sk->sk_lock.wq);
+		kevent_socket_notify(sk, KEVENT_SOCKET_RECV|KEVENT_SOCKET_SEND);
+	}
 	spin_unlock_bh(&sk->sk_lock.slock);
 }
 EXPORT_SYMBOL(release_sock);
diff --git a/net/core/stream.c b/net/core/stream.c
index d1d7dec..2878c2a 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -36,6 +36,7 @@ void sk_stream_write_space(struct sock *sk)
 			wake_up_interruptible(sk->sk_sleep);
 		if (sock->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN))
 			sock_wake_async(sock, 2, POLL_OUT);
+		kevent_socket_notify(sk, KEVENT_SOCKET_SEND|KEVENT_SOCKET_RECV);
 	}
 }
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index c701f6a..84ce4c5 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3127,6 +3127,7 @@ static void tcp_ofo_queue(struct sock *sk)
 
 		__skb_unlink(skb, &tp->out_of_order_queue);
 		__skb_queue_tail(&sk->sk_receive_queue, skb);
+		kevent_socket_notify(sk, KEVENT_SOCKET_RECV);
 		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
 		if(skb->h.th->fin)
 			tcp_fin(skb, sk, skb->h.th);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index bf7a224..bca9f2a 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -61,6 +61,7 @@
 #include <linux/jhash.h>
 #include <linux/init.h>
 #include <linux/times.h>
+#include <linux/kevent.h>
 
 #include <net/icmp.h>
 #include <net/inet_hashtables.h>
@@ -1391,6 +1392,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	   	reqsk_free(req);
 	} else {
 		inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
+		kevent_socket_notify(sk, KEVENT_SOCKET_ACCEPT);
 	}
 	return 0;
 
diff --git a/net/socket.c b/net/socket.c
index 4e39631..776dc2e 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -84,6 +84,7 @@
 #include <linux/kmod.h>
 #include <linux/audit.h>
 #include <linux/wireless.h>
+#include <linux/kevent.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -496,6 +497,8 @@ static struct socket *sock_alloc(void)
 	inode->i_uid = current->fsuid;
 	inode->i_gid = current->fsgid;
 
+	kevent_socket_reinit(sock);
+
 	get_cpu_var(sockets_in_use)++;
 	put_cpu_var(sockets_in_use);
 	return sock;
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 2f208c7..835e20f 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1563,8 +1563,10 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
 	struct scm_cookie tmp_scm;
 	struct sock *sk = sock->sk;
 	struct unix_sock *u = unix_sk(sk);
+	struct sock *other;
 	int noblock = flags & MSG_DONTWAIT;
 	struct sk_buff *skb;
+
 	int err;
 
 	err = -EOPNOTSUPP;
@@ -1580,6 +1582,12 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
 		goto out_unlock;
 
 	wake_up_interruptible(&u->peer_wait);
+	other =unix_peer_get(sk);
+	if (other) {
+		kevent_socket_notify(other, KEVENT_SOCKET_SEND);
+		sock_put(other);
+	} else
+		kevent_socket_notify(sk, KEVENT_SOCKET_RECV);
 
 	if (msg->msg_name)
 		unix_copy_addr(msg, skb->sk);
@@ -1674,7 +1682,7 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
 {
 	struct sock_iocb *siocb = kiocb_to_siocb(iocb);
 	struct scm_cookie tmp_scm;
-	struct sock *sk = sock->sk;
+	struct sock *sk = sock->sk, *other;
 	struct unix_sock *u = unix_sk(sk);
 	struct sockaddr_un *sunaddr=msg->msg_name;
 	int copied = 0;
@@ -1803,6 +1811,14 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
 		}
 	} while (size);
 
+	other =unix_peer_get(sk);
+	if (other) {
+		kevent_socket_notify(other, KEVENT_SOCKET_SEND);
+		sock_put(other);
+	}
+	if (sk->sk_shutdown & RCV_SHUTDOWN || !other)
+		kevent_socket_notify(sk, KEVENT_SOCKET_RECV);
+
 	mutex_unlock(&u->readlock);
 	scm_recv(sock, msg, siocb->scm, flags);
 out:
@@ -1824,6 +1840,10 @@ static int unix_shutdown(struct socket *sock, int mode)
 			sock_hold(other);
 		unix_state_wunlock(sk);
 		sk->sk_state_change(sk);
+		kevent_socket_notify(sk, KEVENT_SOCKET_SEND|KEVENT_SOCKET_RECV);
+		if (other)
+			kevent_socket_notify(other, KEVENT_SOCKET_SEND|KEVENT_SOCKET_RECV);
+		kevent_socket_notify(sk, KEVENT_SOCKET_SEND|KEVENT_SOCKET_RECV);
 
 		if (other &&
 			(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [take30 5/9] kevent: Timer notifications.
  2006-12-29 12:25         ` [take30 4/9] kevent: Socket notifications Evgeniy Polyakov
@ 2006-12-29 12:25           ` Evgeniy Polyakov
  2006-12-29 12:25             ` [take30 6/9] kevent: Pipe notifications Evgeniy Polyakov
  0 siblings, 1 reply; 76+ messages in thread
From: Evgeniy Polyakov @ 2006-12-29 12:25 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: David Miller, Ulrich Drepper, Andrew Morton, Evgeniy Polyakov,
	netdev, Zach Brown, Christoph Hellwig, Chase Venters,
	Johann Borck, linux-kernel, Jeff Garzik, Jamal Hadi Salim,
	Ingo Molnar


Timer notifications.

Timer notifications can be used for fine grained per-process time 
management, since interval timers are very inconvenient to use, 
and they are limited.

This subsystem uses high-resolution timers.
id.raw[0] is used as number of seconds
id.raw[1] is used as number of nanoseconds

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>

diff --git a/kernel/kevent/kevent_timer.c b/kernel/kevent/kevent_timer.c
new file mode 100644
index 0000000..c21a155
--- /dev/null
+++ b/kernel/kevent/kevent_timer.c
@@ -0,0 +1,114 @@
+/*
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/hrtimer.h>
+#include <linux/jiffies.h>
+#include <linux/kevent.h>
+
+struct kevent_timer
+{
+	struct hrtimer		ktimer;
+	struct kevent_storage	ktimer_storage;
+	struct kevent		*ktimer_event;
+};
+
+static int kevent_timer_func(struct hrtimer *timer)
+{
+	struct kevent_timer *t = container_of(timer, struct kevent_timer, ktimer);
+	struct kevent *k = t->ktimer_event;
+
+	kevent_storage_ready(&t->ktimer_storage, NULL, KEVENT_MASK_ALL);
+	hrtimer_forward(timer, timer->base->softirq_time,
+			ktime_set(k->event.id.raw[0], k->event.id.raw[1]));
+	return HRTIMER_RESTART;
+}
+
+static struct lock_class_key kevent_timer_key;
+
+static int kevent_timer_enqueue(struct kevent *k)
+{
+	int err;
+	struct kevent_timer *t;
+
+	t = kmalloc(sizeof(struct kevent_timer), GFP_KERNEL);
+	if (!t)
+		return -ENOMEM;
+
+	hrtimer_init(&t->ktimer, CLOCK_MONOTONIC, HRTIMER_REL);
+	t->ktimer.expires = ktime_set(k->event.id.raw[0], k->event.id.raw[1]);
+	t->ktimer.function = kevent_timer_func;
+	t->ktimer_event = k;
+
+	err = kevent_storage_init(&t->ktimer, &t->ktimer_storage);
+	if (err)
+		goto err_out_free;
+	lockdep_set_class(&t->ktimer_storage.lock, &kevent_timer_key);
+
+	err = kevent_storage_enqueue(&t->ktimer_storage, k);
+	if (err)
+		goto err_out_st_fini;
+
+	hrtimer_start(&t->ktimer, t->ktimer.expires, HRTIMER_REL);
+
+	return 0;
+
+err_out_st_fini:
+	kevent_storage_fini(&t->ktimer_storage);
+err_out_free:
+	kfree(t);
+
+	return err;
+}
+
+static int kevent_timer_dequeue(struct kevent *k)
+{
+	struct kevent_storage *st = k->st;
+	struct kevent_timer *t = container_of(st, struct kevent_timer, ktimer_storage);
+
+	hrtimer_cancel(&t->ktimer);
+	kevent_storage_dequeue(st, k);
+	kfree(t);
+
+	return 0;
+}
+
+static int kevent_timer_callback(struct kevent *k)
+{
+	k->event.ret_data[0] = jiffies_to_msecs(jiffies);
+	return 1;
+}
+
+static int __init kevent_init_timer(void)
+{
+	struct kevent_callbacks tc = {
+		.callback = &kevent_timer_callback,
+		.enqueue = &kevent_timer_enqueue,
+		.dequeue = &kevent_timer_dequeue,
+		.flags = 0,
+	};
+
+	return kevent_add_callbacks(&tc, KEVENT_TIMER);
+}
+module_init(kevent_init_timer);
+


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [take30 9/9] kevent: Private userspace notifications.
  2006-12-29 12:25                 ` [take30 8/9] kevent: Kevent posix timer notifications Evgeniy Polyakov
@ 2006-12-29 12:25                   ` Evgeniy Polyakov
  0 siblings, 0 replies; 76+ messages in thread
From: Evgeniy Polyakov @ 2006-12-29 12:25 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: David Miller, Ulrich Drepper, Andrew Morton, Evgeniy Polyakov,
	netdev, Zach Brown, Christoph Hellwig, Chase Venters,
	Johann Borck, linux-kernel, Jeff Garzik, Jamal Hadi Salim,
	Ingo Molnar


Private userspace notifications.

Allows to register notifications of any private userspace
events over kevent. Events can be marked as readt using 
kevent_ctl(KEVENT_READY) command.

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>

diff --git a/kernel/kevent/kevent_unotify.c b/kernel/kevent/kevent_unotify.c
new file mode 100644
index 0000000..618c09c
--- /dev/null
+++ b/kernel/kevent/kevent_unotify.c
@@ -0,0 +1,62 @@
+/*
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/kevent.h>
+
+static int kevent_unotify_callback(struct kevent *k)
+{
+	return 1;
+}
+
+int kevent_unotify_enqueue(struct kevent *k)
+{
+	int err;
+
+	err = kevent_storage_enqueue(&k->user->st, k);
+	if (err)
+		goto err_out_exit;
+
+	if (k->event.req_flags & KEVENT_REQ_ALWAYS_QUEUE)
+		kevent_requeue(k);
+
+	return 0;
+
+err_out_exit:
+	return err;
+}
+
+int kevent_unotify_dequeue(struct kevent *k)
+{
+	kevent_storage_dequeue(k->st, k);
+	return 0;
+}
+
+static int __init kevent_init_unotify(void)
+{
+	struct kevent_callbacks sc = {
+		.callback = &kevent_unotify_callback,
+		.enqueue = &kevent_unotify_enqueue,
+		.dequeue = &kevent_unotify_dequeue,
+		.flags = 0,
+	};
+
+	return kevent_add_callbacks(&sc, KEVENT_UNOTIFY);
+}
+module_init(kevent_init_unotify);


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [take30 7/9] kevent: Signal notifications.
  2006-12-29 12:25             ` [take30 6/9] kevent: Pipe notifications Evgeniy Polyakov
@ 2006-12-29 12:25               ` Evgeniy Polyakov
  2006-12-29 12:25                 ` [take30 8/9] kevent: Kevent posix timer notifications Evgeniy Polyakov
  0 siblings, 1 reply; 76+ messages in thread
From: Evgeniy Polyakov @ 2006-12-29 12:25 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: David Miller, Ulrich Drepper, Andrew Morton, Evgeniy Polyakov,
	netdev, Zach Brown, Christoph Hellwig, Chase Venters,
	Johann Borck, linux-kernel, Jeff Garzik, Jamal Hadi Salim,
	Ingo Molnar


Signal notifications.

This type of notifications allows to deliver signals through kevent queue.
One can find example application signal.c on project homepage.

If KEVENT_SIGNAL_NOMASK bit is set in raw_u64 id then signal will be
delivered only through queue, otherwise both delivery types are used - old
through update of mask of pending signals and through queue.

If signal is delivered only through kevent queue mask of pending signals
is not updated at all, which is equal to putting signal into blocked mask,
but with delivery of that signal through kevent queue.

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>


diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4463735..e7372f2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -82,6 +82,7 @@ struct sched_param {
 #include <linux/resource.h>
 #include <linux/timer.h>
 #include <linux/hrtimer.h>
+#include <linux/kevent_storage.h>
 #include <linux/task_io_accounting.h>
 
 #include <asm/processor.h>
@@ -1048,6 +1049,10 @@ struct task_struct {
 #ifdef	CONFIG_TASK_DELAY_ACCT
 	struct task_delay_info *delays;
 #endif
+#ifdef CONFIG_KEVENT_SIGNAL
+	struct kevent_storage st;
+	u32 kevent_signals;
+#endif
 #ifdef CONFIG_FAULT_INJECTION
 	int make_it_fail;
 #endif
diff --git a/kernel/fork.c b/kernel/fork.c
index fc723e5..fd7c749 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -49,6 +49,7 @@
 #include <linux/delayacct.h>
 #include <linux/taskstats_kern.h>
 #include <linux/random.h>
+#include <linux/kevent.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -118,6 +119,9 @@ void __put_task_struct(struct task_struct *tsk)
 	WARN_ON(atomic_read(&tsk->usage));
 	WARN_ON(tsk == current);
 
+#ifdef CONFIG_KEVENT_SIGNAL
+	kevent_storage_fini(&tsk->st);
+#endif
 	security_task_free(tsk);
 	free_uid(tsk->user);
 	put_group_info(tsk->group_info);
@@ -1126,6 +1130,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	if (retval)
 		goto bad_fork_cleanup_namespaces;
 
+#ifdef CONFIG_KEVENT_SIGNAL
+	kevent_storage_init(p, &p->st);
+#endif
+
 	p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
 	/*
 	 * Clear TID on mm_release()?
diff --git a/kernel/kevent/kevent_signal.c b/kernel/kevent/kevent_signal.c
new file mode 100644
index 0000000..abe3972
--- /dev/null
+++ b/kernel/kevent/kevent_signal.c
@@ -0,0 +1,94 @@
+/*
+ * 	kevent_signal.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/kevent.h>
+
+static int kevent_signal_callback(struct kevent *k)
+{
+	struct task_struct *tsk = k->st->origin;
+	int sig = k->event.id.raw[0];
+	int ret = 0;
+
+	if (sig == tsk->kevent_signals)
+		ret = 1;
+
+	if (ret && (k->event.id.raw_u64 & KEVENT_SIGNAL_NOMASK))
+		tsk->kevent_signals |= 0x80000000;
+
+	return ret;
+}
+
+int kevent_signal_enqueue(struct kevent *k)
+{
+	int err;
+
+	err = kevent_storage_enqueue(&current->st, k);
+	if (err)
+		goto err_out_exit;
+
+	if (k->event.req_flags & KEVENT_REQ_ALWAYS_QUEUE) {
+		kevent_requeue(k);
+		err = 0;
+	} else {
+		err = k->callbacks.callback(k);
+		if (err)
+			goto err_out_dequeue;
+	}
+
+	return err;
+
+err_out_dequeue:
+	kevent_storage_dequeue(k->st, k);
+err_out_exit:
+	return err;
+}
+
+int kevent_signal_dequeue(struct kevent *k)
+{
+	kevent_storage_dequeue(k->st, k);
+	return 0;
+}
+
+int kevent_signal_notify(struct task_struct *tsk, int sig)
+{
+	tsk->kevent_signals = sig;
+	kevent_storage_ready(&tsk->st, NULL, KEVENT_SIGNAL_DELIVERY);
+	return (tsk->kevent_signals & 0x80000000);
+}
+
+static int __init kevent_init_signal(void)
+{
+	struct kevent_callbacks sc = {
+		.callback = &kevent_signal_callback,
+		.enqueue = &kevent_signal_enqueue,
+		.dequeue = &kevent_signal_dequeue,
+		.flags = 0,
+	};
+
+	return kevent_add_callbacks(&sc, KEVENT_SIGNAL);
+}
+module_init(kevent_init_signal);
diff --git a/kernel/signal.c b/kernel/signal.c
index 5630255..f12ebc0 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -23,6 +23,7 @@
 #include <linux/ptrace.h>
 #include <linux/signal.h>
 #include <linux/capability.h>
+#include <linux/kevent.h>
 #include <linux/freezer.h>
 #include <linux/pid_namespace.h>
 #include <linux/nsproxy.h>
@@ -714,6 +715,9 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
 {
 	struct sigqueue * q = NULL;
 	int ret = 0;
+	
+	if (kevent_signal_notify(t, sig))
+		return 1;
 
 	/*
 	 * fast-pathed signals for kernel-internal things like SIGSTOP
@@ -793,6 +797,17 @@ specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t)
 	ret = send_signal(sig, info, t, &t->pending);
 	if (!ret && !sigismember(&t->blocked, sig))
 		signal_wake_up(t, sig == SIGKILL);
+#ifdef CONFIG_KEVENT_SIGNAL
+	/*
+	 * Kevent allows to deliver signals through kevent queue, 
+	 * it is possible to setup kevent to not deliver
+	 * signal through the usual way, in that case send_signal()
+	 * returns 1 and signal is delivered only through kevent queue.
+	 * We simulate successfull delivery notification through this hack:
+	 */
+	if (ret == 1)
+		ret = 0;
+#endif
 out:
 	return ret;
 }
@@ -982,6 +997,17 @@ __group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
 	 * to avoid several races.
 	 */
 	ret = send_signal(sig, info, p, &p->signal->shared_pending);
+#ifdef CONFIG_KEVENT_SIGNAL
+	/*
+	 * Kevent allows to deliver signals through kevent queue, 
+	 * it is possible to setup kevent to not deliver
+	 * signal through the usual way, in that case send_signal()
+	 * returns 1 and signal is delivered only through kevent queue.
+	 * We simulate successfull delivery notification through this hack:
+	 */
+	if (ret == 1)
+		ret = 0;
+#endif
 	if (unlikely(ret))
 		return ret;
 


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [take30 8/9] kevent: Kevent posix timer notifications.
  2006-12-29 12:25               ` [take30 7/9] kevent: Signal notifications Evgeniy Polyakov
@ 2006-12-29 12:25                 ` Evgeniy Polyakov
  2006-12-29 12:25                   ` [take30 9/9] kevent: Private userspace notifications Evgeniy Polyakov
  0 siblings, 1 reply; 76+ messages in thread
From: Evgeniy Polyakov @ 2006-12-29 12:25 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: David Miller, Ulrich Drepper, Andrew Morton, Evgeniy Polyakov,
	netdev, Zach Brown, Christoph Hellwig, Chase Venters,
	Johann Borck, linux-kernel, Jeff Garzik, Jamal Hadi Salim,
	Ingo Molnar


Kevent posix timer notifications.

Simple extensions to POSIX timers which allows
to deliver notification of the timer expiration
through kevent queue.

Example application posix_timer.c can be found
in archive on project homepage.

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>


diff --git a/include/asm-generic/siginfo.h b/include/asm-generic/siginfo.h
index 8786e01..3768746 100644
--- a/include/asm-generic/siginfo.h
+++ b/include/asm-generic/siginfo.h
@@ -235,6 +235,7 @@ typedef struct siginfo {
 #define SIGEV_NONE	1	/* other notification: meaningless */
 #define SIGEV_THREAD	2	/* deliver via thread creation */
 #define SIGEV_THREAD_ID 4	/* deliver to thread */
+#define SIGEV_KEVENT	8	/* deliver through kevent queue */
 
 /*
  * This works because the alignment is ok on all current architectures
@@ -260,6 +261,8 @@ typedef struct sigevent {
 			void (*_function)(sigval_t);
 			void *_attribute;	/* really pthread_attr_t */
 		} _sigev_thread;
+
+		int kevent_fd;
 	} _sigev_un;
 } sigevent_t;
 
diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index a7dd38f..4b9deb4 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -4,6 +4,7 @@
 #include <linux/spinlock.h>
 #include <linux/list.h>
 #include <linux/sched.h>
+#include <linux/kevent_storage.h>
 
 union cpu_time_count {
 	cputime_t cpu;
@@ -49,6 +50,9 @@ struct k_itimer {
 	sigval_t it_sigev_value;	/* value word of sigevent struct */
 	struct task_struct *it_process;	/* process to send signal to */
 	struct sigqueue *sigq;		/* signal queue entry. */
+#ifdef CONFIG_KEVENT_TIMER
+	struct kevent_storage st;
+#endif
 	union {
 		struct {
 			struct hrtimer timer;
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 5fe87de..5ec805e 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -48,6 +48,8 @@
 #include <linux/wait.h>
 #include <linux/workqueue.h>
 #include <linux/module.h>
+#include <linux/kevent.h>
+#include <linux/file.h>
 
 /*
  * Management arrays for POSIX timers.	 Timers are kept in slab memory
@@ -224,6 +226,100 @@ static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp)
 	return 0;
 }
 
+#ifdef CONFIG_KEVENT_TIMER
+static int posix_kevent_enqueue(struct kevent *k)
+{
+	/*
+	 * It is not ugly - there is no pointer in the id field union, 
+	 * but its size is 64bits, which is ok for any known pointer size.
+	 */
+	struct k_itimer *tmr = (struct k_itimer *)(unsigned long)k->event.id.raw_u64;
+	return kevent_storage_enqueue(&tmr->st, k);
+}
+static int posix_kevent_dequeue(struct kevent *k)
+{
+	struct k_itimer *tmr = (struct k_itimer *)(unsigned long)k->event.id.raw_u64;
+	kevent_storage_dequeue(&tmr->st, k);
+	return 0;
+}
+static int posix_kevent_callback(struct kevent *k)
+{
+	return 1;
+}
+static int posix_kevent_init(void)
+{
+	struct kevent_callbacks tc = {
+		.callback = &posix_kevent_callback,
+		.enqueue = &posix_kevent_enqueue,
+		.dequeue = &posix_kevent_dequeue,
+		.flags = KEVENT_CALLBACKS_KERNELONLY};
+
+	return kevent_add_callbacks(&tc, KEVENT_POSIX_TIMER);
+}
+
+extern struct file_operations kevent_user_fops;
+
+static int posix_kevent_init_timer(struct k_itimer *tmr, int fd)
+{
+	struct ukevent uk;
+	struct file *file;
+	struct kevent_user *u;
+	int err;
+
+	file = fget(fd);
+	if (!file) {
+		err = -EBADF;
+		goto err_out;
+	}
+
+	if (file->f_op != &kevent_user_fops) {
+		err = -EINVAL;
+		goto err_out_fput;
+	}
+
+	u = file->private_data;
+
+	memset(&uk, 0, sizeof(struct ukevent));
+
+	uk.event = KEVENT_MASK_ALL;
+	uk.type = KEVENT_POSIX_TIMER;
+	uk.id.raw_u64 = (unsigned long)(tmr); /* Just cast to something unique */
+	uk.req_flags = KEVENT_REQ_ONESHOT | KEVENT_REQ_ALWAYS_QUEUE;
+	uk.ptr = tmr->it_sigev_value.sival_ptr;
+
+	err = kevent_user_add_ukevent(&uk, u);
+	if (err)
+		goto err_out_fput;
+
+	fput(file);
+
+	return 0;
+
+err_out_fput:
+	fput(file);
+err_out:
+	return err;
+}
+
+static void posix_kevent_fini_timer(struct k_itimer *tmr)
+{
+	kevent_storage_fini(&tmr->st);
+}
+#else
+static int posix_kevent_init_timer(struct k_itimer *tmr, int fd)
+{
+	return -ENOSYS;
+}
+static int posix_kevent_init(void)
+{
+	return 0;
+}
+static void posix_kevent_fini_timer(struct k_itimer *tmr)
+{
+}
+#endif
+
+
 /*
  * Initialize everything, well, just everything in Posix clocks/timers ;)
  */
@@ -241,6 +337,11 @@ static __init int init_posix_timers(void)
 	register_posix_clock(CLOCK_REALTIME, &clock_realtime);
 	register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic);
 
+	if (posix_kevent_init()) {
+		printk(KERN_ERR "Failed to initialize kevent posix timers.\n");
+		BUG();
+	}
+
 	posix_timers_cache = kmem_cache_create("posix_timers_cache",
 					sizeof (struct k_itimer), 0, 0, NULL, NULL);
 	idr_init(&posix_timers_id);
@@ -343,23 +444,29 @@ static int posix_timer_fn(struct hrtimer *timer)
 
 	timr = container_of(timer, struct k_itimer, it.real.timer);
 	spin_lock_irqsave(&timr->it_lock, flags);
+	
+	if (timr->it_sigev_notify == SIGEV_KEVENT) {
+#ifdef CONFIG_KEVENT_TIMER
+		kevent_storage_ready(&timr->st, NULL, KEVENT_MASK_ALL);
+#endif
+	} else {
+		if (timr->it.real.interval.tv64 != 0)
+			si_private = ++timr->it_requeue_pending;
 
-	if (timr->it.real.interval.tv64 != 0)
-		si_private = ++timr->it_requeue_pending;
-
-	if (posix_timer_event(timr, si_private)) {
-		/*
-		 * signal was not sent because of sig_ignor
-		 * we will not get a call back to restart it AND
-		 * it should be restarted.
-		 */
-		if (timr->it.real.interval.tv64 != 0) {
-			timr->it_overrun +=
-				hrtimer_forward(timer,
-						timer->base->softirq_time,
-						timr->it.real.interval);
-			ret = HRTIMER_RESTART;
-			++timr->it_requeue_pending;
+		if (posix_timer_event(timr, si_private)) {
+			/*
+			 * signal was not sent because of sig_ignor
+			 * we will not get a call back to restart it AND
+			 * it should be restarted.
+			 */
+			if (timr->it.real.interval.tv64 != 0) {
+				timr->it_overrun +=
+					hrtimer_forward(timer,
+							timer->base->softirq_time,
+							timr->it.real.interval);
+				ret = HRTIMER_RESTART;
+				++timr->it_requeue_pending;
+			}
 		}
 	}
 
@@ -407,6 +514,9 @@ static struct k_itimer * alloc_posix_timer(void)
 		kmem_cache_free(posix_timers_cache, tmr);
 		tmr = NULL;
 	}
+#ifdef CONFIG_KEVENT_TIMER
+	kevent_storage_init(tmr, &tmr->st);
+#endif
 	return tmr;
 }
 
@@ -424,6 +534,7 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
 	if (unlikely(tmr->it_process) &&
 	    tmr->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
 		put_task_struct(tmr->it_process);
+	posix_kevent_fini_timer(tmr);
 	kmem_cache_free(posix_timers_cache, tmr);
 }
 
@@ -496,40 +607,52 @@ sys_timer_create(const clockid_t which_clock,
 		new_timer->it_sigev_signo = event.sigev_signo;
 		new_timer->it_sigev_value = event.sigev_value;
 
-		read_lock(&tasklist_lock);
-		if ((process = good_sigevent(&event))) {
-			/*
-			 * We may be setting up this process for another
-			 * thread.  It may be exiting.  To catch this
-			 * case the we check the PF_EXITING flag.  If
-			 * the flag is not set, the siglock will catch
-			 * him before it is too late (in exit_itimers).
-			 *
-			 * The exec case is a bit more invloved but easy
-			 * to code.  If the process is in our thread
-			 * group (and it must be or we would not allow
-			 * it here) and is doing an exec, it will cause
-			 * us to be killed.  In this case it will wait
-			 * for us to die which means we can finish this
-			 * linkage with our last gasp. I.e. no code :)
-			 */
+		if (event.sigev_notify == SIGEV_KEVENT) {
+			error = posix_kevent_init_timer(new_timer, event._sigev_un.kevent_fd);
+			if (error)
+				goto out;
+
+			process = current->group_leader;
 			spin_lock_irqsave(&process->sighand->siglock, flags);
-			if (!(process->flags & PF_EXITING)) {
-				new_timer->it_process = process;
-				list_add(&new_timer->list,
-					 &process->signal->posix_timers);
-				spin_unlock_irqrestore(&process->sighand->siglock, flags);
-				if (new_timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
-					get_task_struct(process);
-			} else {
-				spin_unlock_irqrestore(&process->sighand->siglock, flags);
-				process = NULL;
+			new_timer->it_process = process;
+			list_add(&new_timer->list, &process->signal->posix_timers);
+			spin_unlock_irqrestore(&process->sighand->siglock, flags);
+		} else {
+			read_lock(&tasklist_lock);
+			if ((process = good_sigevent(&event))) {
+				/*
+				 * We may be setting up this process for another
+				 * thread.  It may be exiting.  To catch this
+				 * case the we check the PF_EXITING flag.  If
+				 * the flag is not set, the siglock will catch
+				 * him before it is too late (in exit_itimers).
+				 *
+				 * The exec case is a bit more invloved but easy
+				 * to code.  If the process is in our thread
+				 * group (and it must be or we would not allow
+				 * it here) and is doing an exec, it will cause
+				 * us to be killed.  In this case it will wait
+				 * for us to die which means we can finish this
+				 * linkage with our last gasp. I.e. no code :)
+				 */
+				spin_lock_irqsave(&process->sighand->siglock, flags);
+				if (!(process->flags & PF_EXITING)) {
+					new_timer->it_process = process;
+					list_add(&new_timer->list,
+						 &process->signal->posix_timers);
+					spin_unlock_irqrestore(&process->sighand->siglock, flags);
+					if (new_timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
+						get_task_struct(process);
+				} else {
+					spin_unlock_irqrestore(&process->sighand->siglock, flags);
+					process = NULL;
+				}
+			}
+			read_unlock(&tasklist_lock);
+			if (!process) {
+				error = -EINVAL;
+				goto out;
 			}
-		}
-		read_unlock(&tasklist_lock);
-		if (!process) {
-			error = -EINVAL;
-			goto out;
 		}
 	} else {
 		new_timer->it_sigev_notify = SIGEV_SIGNAL;


^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [take29 0/8] kevent: Generic event handling mechanism.
  2006-12-29  8:55     ` Evgeniy Polyakov
@ 2006-12-29 12:54       ` Ingo Molnar
  2006-12-29 13:14         ` Evgeniy Polyakov
  0 siblings, 1 reply; 76+ messages in thread
From: Ingo Molnar @ 2006-12-29 12:54 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: David Miller, Ulrich Drepper, Andrew Morton, netdev, Zach Brown,
	Christoph Hellwig, Chase Venters, Johann Borck, linux-kernel,
	Jeff Garzik, Jamal Hadi Salim


* Evgeniy Polyakov <johnpol@2ka.mipt.ru> wrote:

> On Thu, Dec 28, 2006 at 05:01:37PM +0100, Ingo Molnar (mingo@elte.hu) wrote:
> > 
> > * Evgeniy Polyakov <johnpol@2ka.mipt.ru> wrote:
> > 
> > > Generic event handling mechanism.
> > 
> > i see it covers alot of event sources, but i cannot see block IO 
> > notifications. Am i missing something?
> 
> Depending on what it is :) If you mean kevent based AIO, then it was 
> dropped to reduce size of the patchset, and in favour of new AIO 
> design.

yes, kevent based AIO. Could you please re-add it, preferably ontop of 
Suparna's AIO patchset? I dont see how a "generic event handling 
mechanism" can exclude block IO because we really need to see how it 
plugs into (and plays along with) block AIO and how it performs relative 
to block AIO to be able to judge whether this API and infrastructure 
should be included in the kernel in its current form.

> Other kinds of read/write notifications can be handled by poll/select 
> notifications.

but poll/select notifications are just a second-degree way of doing an 
IO state machine, and they are mostly there in kevents for completeness 
and compatibility.

To be able to judge a "generic" event mechanism it really must support 
block IO as well, natively. Without that we'd have the following obscene 
API situation:

 - poll()/select(): supports everything but is slow and inaccurate
 - epoll(): more modern API ontop of poll notifications
 - async IO: supports block IO
 - kevent supports almost everything /except/ block IO

so what we need is for kevents to support /all/ the important 
high-performance event types natively:

 - networking
 - block IO
 - VFS namespace
 - timers

(rarer things like mouse/input events can stay with poll notifications)

and it is /especially/ important to include block IO events in kevents 
to be able to judge its performance and scalability relative to the 
async IO API and infrastructure.

	Ingo

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [take29 0/8] kevent: Generic event handling mechanism.
  2006-12-29 12:54       ` Ingo Molnar
@ 2006-12-29 13:14         ` Evgeniy Polyakov
  2006-12-29 13:24           ` Ingo Molnar
  0 siblings, 1 reply; 76+ messages in thread
From: Evgeniy Polyakov @ 2006-12-29 13:14 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: David Miller, Ulrich Drepper, Andrew Morton, netdev, Zach Brown,
	Christoph Hellwig, Chase Venters, Johann Borck, linux-kernel,
	Jeff Garzik, Jamal Hadi Salim

On Fri, Dec 29, 2006 at 01:54:27PM +0100, Ingo Molnar (mingo@elte.hu) wrote:
> > > > Generic event handling mechanism.
> > > 
> > > i see it covers alot of event sources, but i cannot see block IO 
> > > notifications. Am i missing something?
> > 
> > Depending on what it is :) If you mean kevent based AIO, then it was 
> > dropped to reduce size of the patchset, and in favour of new AIO 
> > design.
> 
> yes, kevent based AIO. Could you please re-add it, preferably ontop of 
> Suparna's AIO patchset? I dont see how a "generic event handling 
> mechanism" can exclude block IO because we really need to see how it 
> plugs into (and plays along with) block AIO and how it performs relative 
> to block AIO to be able to judge whether this API and infrastructure 
> should be included in the kernel in its current form.

I like new design much more than my previous kevent based approach and
existing repeated call approach. I plan to start working on it jst after
New Year vacations are over (in about a week or two, it is the longest
vacations of the year in Russia, which are spent in a way which does not 
allow to hack or perform any other usefull work).
Kevent AIO was completely different thing than Suparna's AIO, and
although it hooked into block/fs subsystem on a bit different layer (I
exported ->get_block() callback), it was possible to fully separate AIO
from main code.

> > Other kinds of read/write notifications can be handled by poll/select 
> > notifications.
> 
> but poll/select notifications are just a second-degree way of doing an 
> IO state machine, and they are mostly there in kevents for completeness 
> and compatibility.

Yes, indeed.

> To be able to judge a "generic" event mechanism it really must support 
> block IO as well, natively. Without that we'd have the following obscene 
> API situation:
> 
>  - poll()/select(): supports everything but is slow and inaccurate
>  - epoll(): more modern API ontop of poll notifications
>  - async IO: supports block IO

Network AIO should not be different from block IO - it is essentially
the same mechanisms, which just have different lower layer from where
callbacks are invoked. 

>  - kevent supports almost everything /except/ block IO
> 
> so what we need is for kevents to support /all/ the important 
> high-performance event types natively:
> 
>  - networking
>  - block IO
>  - VFS namespace
>  - timers
> 
> (rarer things like mouse/input events can stay with poll notifications)
> 
> and it is /especially/ important to include block IO events in kevents 
> to be able to judge its performance and scalability relative to the 
> async IO API and infrastructure.

Yes, async IO is a significant part, and will be implemented, IMHO, new
design I highlighted in linux-fsdevel@ in AIO related thread is the way
to go (at least I will imlement it that way).

> 	Ingo

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [take29 0/8] kevent: Generic event handling mechanism.
  2006-12-29 13:14         ` Evgeniy Polyakov
@ 2006-12-29 13:24           ` Ingo Molnar
  0 siblings, 0 replies; 76+ messages in thread
From: Ingo Molnar @ 2006-12-29 13:24 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: David Miller, Ulrich Drepper, Andrew Morton, netdev, Zach Brown,
	Christoph Hellwig, Chase Venters, Johann Borck, linux-kernel,
	Jeff Garzik, Jamal Hadi Salim


* Evgeniy Polyakov <johnpol@2ka.mipt.ru> wrote:

> > (rarer things like mouse/input events can stay with poll 
> > notifications)
> > 
> > and it is /especially/ important to include block IO events in 
> > kevents to be able to judge its performance and scalability relative 
> > to the async IO API and infrastructure.
> 
> Yes, async IO is a significant part, and will be implemented, IMHO, 
> new design I highlighted in linux-fsdevel@ in AIO related thread is 
> the way to go (at least I will imlement it that way).

yes. Note that a prototype exists already: take a look at Tux's "work 
atom" infrastructure of how you can build a relatively straightforward 
state-machine that can be programmed and can be driven even from IRQ 
contexts. Via that i implemented fully asynchronous IO for networking 5 
years ago, and programmed it to handle HTTP and FTP protocol server 
logic, fully asynchronously. (For block IO it also does emulation of 
event handling via the 'cachemiss' kernel threads. State-machine driven 
filesystems are quite hard - but not impossible in the long run.)

It would be a natural thing to extend that fundamental concept to 
user-space as well. /That/ i'd call a generic, grounds-up event handling 
infrastructure. That would be a worthwile unification point for all 
existing IO APIs.

	Ingo

^ permalink raw reply	[flat|nested] 76+ messages in thread

* [take31 7/10] kevent: Signal notifications.
  2007-01-08 19:25             ` [take31 6/10] kevent: Pipe notifications Evgeniy Polyakov
@ 2007-01-08 19:25               ` Evgeniy Polyakov
  2007-01-08 19:26                 ` [take31 8/10] kevent: Kevent posix timer notifications Evgeniy Polyakov
  0 siblings, 1 reply; 76+ messages in thread
From: Evgeniy Polyakov @ 2007-01-08 19:25 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: David Miller, Ulrich Drepper, Andrew Morton, Evgeniy Polyakov,
	netdev, Zach Brown, Christoph Hellwig, Chase Venters,
	Johann Borck, linux-kernel, Jeff Garzik, Jamal Hadi Salim,
	Ingo Molnar


Signal notifications.

This type of notifications allows to deliver signals through kevent queue.
One can find example application signal.c on project homepage.

If KEVENT_SIGNAL_NOMASK bit is set in raw_u64 id then signal will be
delivered only through queue, otherwise both delivery types are used - old
through update of mask of pending signals and through queue.

If signal is delivered only through kevent queue mask of pending signals
is not updated at all, which is equal to putting signal into blocked mask,
but with delivery of that signal through kevent queue.

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>


diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4463735..e7372f2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -82,6 +82,7 @@ struct sched_param {
 #include <linux/resource.h>
 #include <linux/timer.h>
 #include <linux/hrtimer.h>
+#include <linux/kevent_storage.h>
 #include <linux/task_io_accounting.h>
 
 #include <asm/processor.h>
@@ -1048,6 +1049,10 @@ struct task_struct {
 #ifdef	CONFIG_TASK_DELAY_ACCT
 	struct task_delay_info *delays;
 #endif
+#ifdef CONFIG_KEVENT_SIGNAL
+	struct kevent_storage st;
+	u32 kevent_signals;
+#endif
 #ifdef CONFIG_FAULT_INJECTION
 	int make_it_fail;
 #endif
diff --git a/kernel/fork.c b/kernel/fork.c
index fc723e5..fd7c749 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -49,6 +49,7 @@
 #include <linux/delayacct.h>
 #include <linux/taskstats_kern.h>
 #include <linux/random.h>
+#include <linux/kevent.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -118,6 +119,9 @@ void __put_task_struct(struct task_struct *tsk)
 	WARN_ON(atomic_read(&tsk->usage));
 	WARN_ON(tsk == current);
 
+#ifdef CONFIG_KEVENT_SIGNAL
+	kevent_storage_fini(&tsk->st);
+#endif
 	security_task_free(tsk);
 	free_uid(tsk->user);
 	put_group_info(tsk->group_info);
@@ -1126,6 +1130,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	if (retval)
 		goto bad_fork_cleanup_namespaces;
 
+#ifdef CONFIG_KEVENT_SIGNAL
+	kevent_storage_init(p, &p->st);
+#endif
+
 	p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
 	/*
 	 * Clear TID on mm_release()?
diff --git a/kernel/kevent/kevent_signal.c b/kernel/kevent/kevent_signal.c
new file mode 100644
index 0000000..abe3972
--- /dev/null
+++ b/kernel/kevent/kevent_signal.c
@@ -0,0 +1,94 @@
+/*
+ * 	kevent_signal.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/kevent.h>
+
+static int kevent_signal_callback(struct kevent *k)
+{
+	struct task_struct *tsk = k->st->origin;
+	int sig = k->event.id.raw[0];
+	int ret = 0;
+
+	if (sig == tsk->kevent_signals)
+		ret = 1;
+
+	if (ret && (k->event.id.raw_u64 & KEVENT_SIGNAL_NOMASK))
+		tsk->kevent_signals |= 0x80000000;
+
+	return ret;
+}
+
+int kevent_signal_enqueue(struct kevent *k)
+{
+	int err;
+
+	err = kevent_storage_enqueue(&current->st, k);
+	if (err)
+		goto err_out_exit;
+
+	if (k->event.req_flags & KEVENT_REQ_ALWAYS_QUEUE) {
+		kevent_requeue(k);
+		err = 0;
+	} else {
+		err = k->callbacks.callback(k);
+		if (err)
+			goto err_out_dequeue;
+	}
+
+	return err;
+
+err_out_dequeue:
+	kevent_storage_dequeue(k->st, k);
+err_out_exit:
+	return err;
+}
+
+int kevent_signal_dequeue(struct kevent *k)
+{
+	kevent_storage_dequeue(k->st, k);
+	return 0;
+}
+
+int kevent_signal_notify(struct task_struct *tsk, int sig)
+{
+	tsk->kevent_signals = sig;
+	kevent_storage_ready(&tsk->st, NULL, KEVENT_SIGNAL_DELIVERY);
+	return (tsk->kevent_signals & 0x80000000);
+}
+
+static int __init kevent_init_signal(void)
+{
+	struct kevent_callbacks sc = {
+		.callback = &kevent_signal_callback,
+		.enqueue = &kevent_signal_enqueue,
+		.dequeue = &kevent_signal_dequeue,
+		.flags = 0,
+	};
+
+	return kevent_add_callbacks(&sc, KEVENT_SIGNAL);
+}
+module_init(kevent_init_signal);
diff --git a/kernel/signal.c b/kernel/signal.c
index 5630255..f12ebc0 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -23,6 +23,7 @@
 #include <linux/ptrace.h>
 #include <linux/signal.h>
 #include <linux/capability.h>
+#include <linux/kevent.h>
 #include <linux/freezer.h>
 #include <linux/pid_namespace.h>
 #include <linux/nsproxy.h>
@@ -714,6 +715,9 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
 {
 	struct sigqueue * q = NULL;
 	int ret = 0;
+	
+	if (kevent_signal_notify(t, sig))
+		return 1;
 
 	/*
 	 * fast-pathed signals for kernel-internal things like SIGSTOP
@@ -793,6 +797,17 @@ specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t)
 	ret = send_signal(sig, info, t, &t->pending);
 	if (!ret && !sigismember(&t->blocked, sig))
 		signal_wake_up(t, sig == SIGKILL);
+#ifdef CONFIG_KEVENT_SIGNAL
+	/*
+	 * Kevent allows to deliver signals through kevent queue, 
+	 * it is possible to setup kevent to not deliver
+	 * signal through the usual way, in that case send_signal()
+	 * returns 1 and signal is delivered only through kevent queue.
+	 * We simulate successfull delivery notification through this hack:
+	 */
+	if (ret == 1)
+		ret = 0;
+#endif
 out:
 	return ret;
 }
@@ -982,6 +997,17 @@ __group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
 	 * to avoid several races.
 	 */
 	ret = send_signal(sig, info, p, &p->signal->shared_pending);
+#ifdef CONFIG_KEVENT_SIGNAL
+	/*
+	 * Kevent allows to deliver signals through kevent queue, 
+	 * it is possible to setup kevent to not deliver
+	 * signal through the usual way, in that case send_signal()
+	 * returns 1 and signal is delivered only through kevent queue.
+	 * We simulate successfull delivery notification through this hack:
+	 */
+	if (ret == 1)
+		ret = 0;
+#endif
 	if (unlikely(ret))
 		return ret;
 


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [take31 1/10] kevent: Description.
  2007-01-08 19:25 ` [take31 0/10] kevent: Generic event handling mechanism Evgeniy Polyakov
@ 2007-01-08 19:25   ` Evgeniy Polyakov
  2007-01-08 19:25     ` [take31 2/10] kevent: Core files Evgeniy Polyakov
  0 siblings, 1 reply; 76+ messages in thread
From: Evgeniy Polyakov @ 2007-01-08 19:25 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: David Miller, Ulrich Drepper, Andrew Morton, Evgeniy Polyakov,
	netdev, Zach Brown, Christoph Hellwig, Chase Venters,
	Johann Borck, linux-kernel, Jeff Garzik, Jamal Hadi Salim,
	Ingo Molnar, linux-fsdevel


Description.


diff --git a/Documentation/kevent.txt b/Documentation/kevent.txt
new file mode 100644
index 0000000..95cb36e
--- /dev/null
+++ b/Documentation/kevent.txt
@@ -0,0 +1,244 @@
+Description.
+
+int kevent_init(struct kevent_ring *ring, unsigned int ring_size, 
+	unsigned int flags);
+
+num - size of the ring buffer in events 
+ring - pointer to allocated ring buffer
+flags - various flags, see KEVENT_FLAGS_* definitions.
+
+Return value: kevent control file descriptor or negative error value.
+
+ struct kevent_ring
+ {
+   unsigned int ring_kidx, ring_over;
+   struct ukevent event[0];
+ }
+
+ring_kidx - index in the ring buffer where kernel will put new events 
+		when kevent_wait() or kevent_get_events() is called 
+ring_over - number of overflows of ring_uidx happend from the start.
+	Overflow counter is used to prevent situation when two threads 
+	are going to free the same events, but one of them was scheduled 
+	away for too long, so ring indexes were wrapped, so when that 
+	thread will be awakened, it will free not those events, which 
+	it suppose to free.
+
+Example userspace code (ring_buffer.c) can be found on project's homepage.
+
+Each kevent syscall can be so called cancellation point in glibc, i.e. when 
+thread has been cancelled in kevent syscall, thread can be safely removed 
+and no events will be lost, since each syscall (kevent_wait() or 
+kevent_get_events()) will copy event into special ring buffer, accessible 
+from other threads or even processes (if shared memory is used).
+
+When kevent is removed (not dequeued when it is ready, but just removed), 
+even if it was ready, it is not copied into ring buffer, since if it is 
+removed, no one cares about it (otherwise user would wait until it becomes 
+ready and got it through usual way using kevent_get_events() or kevent_wait()) 
+and thus no need to copy it to the ring buffer.
+
+-------------------------------------------------------------------------------
+
+
+int kevent_ctl(int fd, unsigned int cmd, unsigned int num, struct ukevent *arg);
+
+fd - is the file descriptor referring to the kevent queue to manipulate. 
+It is created by opening "/dev/kevent" char device, which is created with 
+dynamic minor number and major number assigned for misc devices. 
+
+cmd - is the requested operation. It can be one of the following:
+    KEVENT_CTL_ADD - add event notification 
+    KEVENT_CTL_REMOVE - remove event notification 
+    KEVENT_CTL_MODIFY - modify existing notification 
+    KEVENT_CTL_READY - mark existing events as ready, if number of events is zero,
+    	it just wakes up parked in syscall thread
+
+num - number of struct ukevent in the array pointed to by arg 
+arg - array of struct ukevent
+
+Return value: 
+ number of events processed or negative error value.
+
+When called, kevent_ctl will carry out the operation specified in the 
+cmd parameter.
+-------------------------------------------------------------------------------
+
+ int kevent_get_events(int ctl_fd, unsigned int min_nr, unsigned int max_nr, 
+ 		struct timespec timeout, struct ukevent *buf, unsigned flags);
+
+ctl_fd - file descriptor referring to the kevent queue 
+min_nr - minimum number of completed events that kevent_get_events will block 
+	 waiting for 
+max_nr - number of struct ukevent in buf 
+timeout - time to wait before returning less than min_nr 
+	  events. If this is -1, then wait forever. 
+buf - pointer to an array of struct ukevent. 
+flags - various flags, see KEVENT_FLAGS_* definitions.
+
+Return value:
+ number of events copied or negative error value.
+
+kevent_get_events will wait timeout milliseconds for at least min_nr completed 
+events, copying completed struct ukevents to buf and deleting any 
+KEVENT_REQ_ONESHOT event requests. In nonblocking mode it returns as many 
+events as possible, but not more than max_nr. In blocking mode it waits until 
+timeout or if at least min_nr events are ready.
+
+This function copies event into ring buffer if it was initialized, if ring buffer
+is full, KEVENT_RET_COPY_FAILED flag is set in ret_flags field.
+-------------------------------------------------------------------------------
+
+ int kevent_wait(int ctl_fd, unsigned int num, unsigned int old_uidx, 
+ 	struct timespec timeout, unsigned int flags);
+
+ctl_fd - file descriptor referring to the kevent queue 
+num - number of processed kevents 
+old_uidx - the last index user is aware of
+timeout - time to wait until there is free space in kevent queue
+flags - various flags, see KEVENT_FLAGS_* definitions.
+
+Return value:
+ number of events copied into ring buffer or negative error value.
+
+This syscall waits until either timeout expires or at least one event becomes 
+ready. It also copies events into special ring buffer. If ring buffer is full,
+it waits until there are ready events and then return.
+If kevent is one-shot kevent it is removed in this syscall.
+If kevent is edge-triggered (KEVENT_REQ_ET flag is set in 'req_flags') it is 
+requeued in this syscall for performance reasons.
+-------------------------------------------------------------------------------
+
+ int kevent_commit(int ctl_fd, unsigned int new_idx, unsigned int over);
+
+ctl_fd - file descriptor referring to the kevent queue 
+new_uidx - the last committed kevent
+over - overflow count for given $new_idx value
+
+Return value:
+ number of committed kevents or negative error value.
+
+This function commits, i.e. marks as empty, slots in the ring buffer, so
+they can be reused when userspace completes that entries processing.
+
+Overflow counter is used to prevent situation when two threads are going 
+to free the same events, but one of them was scheduled away for too long, 
+so ring indexes were wrapped, so when that thread will be awakened, it 
+will free not those events, which it suppose to free.
+
+It is possible that returned number of committed events will be smaller than
+requested number - it is possible when several threads try to commit the
+same events.
+-------------------------------------------------------------------------------
+
+The bulk of the interface is entirely done through the ukevent struct. 
+It is used to add event requests, modify existing event requests, 
+specify which event requests to remove, and return completed events.
+
+struct ukevent contains the following members:
+
+struct kevent_id id
+    Id of this request, e.g. socket number, file descriptor and so on 
+__u32 type
+    Event type, e.g. KEVENT_SOCK, KEVENT_INODE, KEVENT_TIMER and so on 
+__u32 event
+    Event itself, e.g. SOCK_ACCEPT, INODE_CREATED, TIMER_FIRED 
+__u32 req_flags
+    Per-event request flags,
+
+    KEVENT_REQ_ONESHOT
+        event will be removed when it is ready 
+
+    KEVENT_REQ_WAKEUP_ALL
+        Kevent wakes up only first thread interested in given event, 
+	or all threads if this flag is set.
+
+    KEVENT_REQ_ET
+        Edge Triggered behaviour. It is an optimisation which allows to move 
+	ready and dequeued (i.e. copied to userspace) event to move into set 
+	of interest for given storage (socket, inode and so on) again. It is 
+	very usefull for cases when the same event should be used many times 
+	(like reading from pipe). It is similar to epoll()'s EPOLLET flag. 
+
+    KEVENT_REQ_LAST_CHECK
+        if set allows to perform the last check on kevent (call appropriate 
+	callback) when kevent is marked as ready and has been removed from 
+	ready queue. If it will be confirmed that kevent is ready 
+	(k->callbacks.callback(k) returns true) then kevent will be copied 
+	to userspace, otherwise it will be requeued back to storage. 
+	Second (checking) call is performed with this bit cleared, so callback 
+	can detect when it was called from kevent_storage_ready() - bit is set, 
+	or kevent_dequeue_ready() - bit is cleared. If kevent will be requeued, 
+	bit will be set again.
+
+   KEVENT_REQ_ALWAYS_QUEUE
+        If this flag is set kevent will be queued into ready queue if it is 
+	ready at enqueue time, otherwise it will be copied back to userspace
+	and will not be queued into the storage.
+
+   KEVENT_REQ_READY
+   	If this flag is set, kevent will be marked as ready immediately at enqueue
+	time.
+
+__u32 ret_flags
+    Per-event return flags
+
+    KEVENT_RET_BROKEN
+        Kevent is broken 
+
+    KEVENT_RET_DONE
+        Kevent processing was finished successfully 
+
+    KEVENT_RET_COPY_FAILED
+        Kevent was not copied into ring buffer due to some error conditions. 
+
+__u32 ret_data
+    Event return data. Event originator fills it with anything it likes 
+    (for example timer notifications put number of milliseconds when timer 
+    has fired 
+union { __u32 user[2]; void *ptr; }
+    User's data. It is not used, just copied to/from user. The whole structure 
+    is aligned to 8 bytes already, so the last union is aligned properly. 
+
+-------------------------------------------------------------------------------
+
+Kevent waiting syscall flags.
+
+KEVENT_FLAGS_ABSTIME - provided timespec parameter contains absolute time, 
+	for example Aug 27, 2194, or time(NULL) + 10.
+
+-------------------------------------------------------------------------------
+
+Usage
+
+For KEVENT_CTL_ADD, all fields relevant to the event type must be filled 
+(id, type, event, req_flags). 
+After kevent_ctl(..., KEVENT_CTL_ADD, ...) returns each struct's ret_flags 
+should be checked to see if the event is already broken or done.
+
+For KEVENT_CTL_MODIFY, the id, req_flags, and user and event fields must be 
+set and an existing kevent request must have matching id and user fields. If 
+match is found, req_flags and event are replaced with the newly supplied 
+values and requeueing is started, so modified kevent can be checked and 
+probably marked as ready immediately. If a match can't be found, the 
+passed in ukevent's ret_flags has KEVENT_RET_BROKEN set. KEVENT_RET_DONE is 
+always set.
+
+For KEVENT_CTL_REMOVE, the id and user fields must be set and an existing 
+kevent request must have matching id and user fields. If a match is found, 
+the kevent request is removed. If a match can't be found, the passed in 
+ukevent's ret_flags has KEVENT_RET_BROKEN set. KEVENT_RET_DONE is always set.
+
+For kevent_get_events, the entire structure is returned.
+
+-------------------------------------------------------------------------------
+
+Usage cases
+
+kevent_timer
+struct ukevent should contain following fields:
+    type - KEVENT_TIMER 
+    event - KEVENT_TIMER_FIRED 
+    req_flags - KEVENT_REQ_ONESHOT if you want to fire that timer only once 
+    id.raw[0] - number of seconds after commit when this timer shout expire 
+    id.raw[0] - additional to number of seconds number of nanoseconds 


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [take31 2/10] kevent: Core files.
  2007-01-08 19:25   ` [take31 1/10] kevent: Description Evgeniy Polyakov
@ 2007-01-08 19:25     ` Evgeniy Polyakov
  2007-01-08 19:25       ` [take31 3/10] kevent: poll/select() notifications Evgeniy Polyakov
  0 siblings, 1 reply; 76+ messages in thread
From: Evgeniy Polyakov @ 2007-01-08 19:25 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: David Miller, Ulrich Drepper, Andrew Morton, Evgeniy Polyakov,
	netdev, Zach Brown, Christoph Hellwig, Chase Venters,
	Johann Borck, linux-kernel, Jeff Garzik, Jamal Hadi Salim,
	Ingo Molnar, linux-fsdevel


Core files.

This patch includes core kevent files:
 * userspace controlling
 * kernelspace interfaces
 * initialization
 * notification state machines

Some bits of documentation can be found on project's homepage (and links from there):
http://tservice.net.ru/~s0mbre/old/?section=projects&item=kevent

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>

diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S
index 2697e92..769137e 100644
--- a/arch/i386/kernel/syscall_table.S
+++ b/arch/i386/kernel/syscall_table.S
@@ -319,3 +319,9 @@ ENTRY(sys_call_table)
 	.long sys_move_pages
 	.long sys_getcpu
 	.long sys_epoll_pwait
+	.long sys_kevent_get_events
+	.long sys_kevent_ctl		/* 320 */
+	.long sys_kevent_wait
+	.long sys_kevent_commit
+	.long sys_kevent_init
+	.lond sys_aio_sendfile
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index b4aa875..d8c9ac9 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -714,8 +714,14 @@ ia32_sys_call_table:
 	.quad compat_sys_get_robust_list
 	.quad sys_splice
 	.quad sys_sync_file_range
-	.quad sys_tee
+	.quad sys_tee			/* 315 */
 	.quad compat_sys_vmsplice
 	.quad compat_sys_move_pages
 	.quad sys_getcpu
+	.quad sys_kevent_get_events
+	.quad sys_kevent_ctl		/* 320 */
+	.quad sys_kevent_wait
+	.quad sys_kevent_commit
+	.quad sys_kevent_init
+	.quad sys_aio_sendfile
 ia32_syscall_end:		
diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
index 833fa17..81429ea 100644
--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h
@@ -325,10 +325,16 @@
 #define __NR_move_pages		317
 #define __NR_getcpu		318
 #define __NR_epoll_pwait	319
+#define __NR_kevent_get_events	320
+#define __NR_kevent_ctl		321
+#define __NR_kevent_wait	322
+#define __NR_kevent_commit	323
+#define __NR_kevent_init	324
+#define __NR_aio_sendfile	325
 
 #ifdef __KERNEL__
 
-#define NR_syscalls 320
+#define NR_syscalls 326
 
 #define __ARCH_WANT_IPC_PARSE_VERSION
 #define __ARCH_WANT_OLD_READDIR
diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h
index c5f596e..6d2f37e 100644
--- a/include/asm-x86_64/unistd.h
+++ b/include/asm-x86_64/unistd.h
@@ -619,8 +619,20 @@ __SYSCALL(__NR_sync_file_range, sys_sync_file_range)
 __SYSCALL(__NR_vmsplice, sys_vmsplice)
 #define __NR_move_pages		279
 __SYSCALL(__NR_move_pages, sys_move_pages)
-
-#define __NR_syscall_max __NR_move_pages
+#define __NR_kevent_get_events	280
+__SYSCALL(__NR_kevent_get_events, sys_kevent_get_events)
+#define __NR_kevent_ctl		281
+__SYSCALL(__NR_kevent_ctl, sys_kevent_ctl)
+#define __NR_kevent_wait	282
+__SYSCALL(__NR_kevent_wait, sys_kevent_wait)
+#define __NR_kevent_commit	283
+__SYSCALL(__NR_kevent_commit, sys_kevent_commit)
+#define __NR_kevent_init	284
+__SYSCALL(__NR_kevent_init, sys_kevent_init)
+#define __NR_aio_sendfile	285
+__SYSCALL(__NR_aio_sendfile, sys_aio_sendfile)
+
+#define __NR_syscall_max __NR_aio_sendfile
 
 #ifndef __NO_STUBS
 #define __ARCH_WANT_OLD_READDIR
diff --git a/include/linux/kevent.h b/include/linux/kevent.h
new file mode 100644
index 0000000..3040d01
--- /dev/null
+++ b/include/linux/kevent.h
@@ -0,0 +1,268 @@
+/*
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef __KEVENT_H
+#define __KEVENT_H
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/rbtree.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/wait.h>
+#include <linux/net.h>
+#include <linux/rcupdate.h>
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/hrtimer.h>
+#include <linux/kevent_storage.h>
+#include <linux/ukevent.h>
+
+#define KEVENT_MIN_BUFFS_ALLOC	3
+
+struct kevent;
+struct kevent_storage;
+typedef int (* kevent_callback_t)(struct kevent *);
+
+/* @callback is called each time new event has been caught. */
+/* @enqueue is called each time new event is queued. */
+/* @dequeue is called each time event is dequeued. */
+/* @flags - flags for given set of callbacks. */
+
+struct kevent_callbacks {
+	kevent_callback_t	callback, enqueue, dequeue;
+	unsigned int		flags;
+};
+
+#define KEVENT_CALLBACKS_KERNELONLY	0x1
+
+int kevent_event_is_allowed(struct ukevent *e);
+
+#define KEVENT_READY		0x1
+#define KEVENT_STORAGE		0x2
+#define KEVENT_USER		0x4
+
+struct kevent
+{
+	/* Used for kevent freeing.*/
+	struct rcu_head		rcu_head;
+	struct ukevent		event;
+	/* This lock protects ukevent manipulations, e.g. ret_flags changes. */
+	spinlock_t		ulock;
+
+	/* Entry of user's tree. */
+	struct rb_node		kevent_node;
+	/* Entry of origin's queue. */
+	struct list_head	storage_entry;
+	/* Entry of user's ready. */
+	struct list_head	ready_entry;
+
+	u32			flags;
+
+	/* User who requested this kevent. */
+	struct kevent_user	*user;
+	/* Kevent container. */
+	struct kevent_storage	*st;
+
+	struct kevent_callbacks	callbacks;
+
+	/* Private data for different storages.
+	 * poll()/select storage has a list of wait_queue_t containers
+	 * for each ->poll() { poll_wait()' } here.
+	 */
+	void			*priv;
+};
+
+struct kevent_user
+{
+	struct rb_root		kevent_root;
+	spinlock_t		kevent_lock;
+	/* Number of queued kevents. */
+	unsigned int		kevent_num;
+
+	/* List of ready kevents. */
+	struct list_head	ready_list;
+	/* Number of ready kevents. */
+	unsigned int		ready_num;
+	/* Protects all manipulations with ready queue. */
+	spinlock_t 		ready_lock;
+
+	/* Protects against simultaneous kevent_user control manipulations. */
+	struct mutex		ctl_mutex;
+	/* Wait until some events are ready. */
+	wait_queue_head_t	wait;
+	/* Exit from syscall if someone wants us to do it */
+	int			need_exit;
+
+	/* Reference counter, increased for each new kevent. */
+	atomic_t		refcnt;
+
+	/* Mutex protecting userspace ring buffer. */
+	struct mutex		ring_lock;
+	/* Kernel index and size of the userspace ring buffer. */
+	unsigned int		kidx, uidx, ring_size, ring_over, full;
+	/* Pointer to userspace ring buffer. */
+	struct kevent_ring __user *pring;
+
+	/* Is used for absolute waiting times. */
+	struct hrtimer		timer;
+
+	/* Used for userspace private notifications. */
+	struct kevent_storage	st;
+
+#ifdef CONFIG_KEVENT_USER_STAT
+	unsigned long		im_num;
+	unsigned long		wait_num, ring_num;
+	unsigned long		total;
+#endif
+};
+
+int kevent_enqueue(struct kevent *k);
+int kevent_dequeue(struct kevent *k);
+int kevent_init(struct kevent *k);
+void kevent_requeue(struct kevent *k);
+int kevent_break(struct kevent *k);
+
+int kevent_add_callbacks(const struct kevent_callbacks *cb, int pos);
+
+void kevent_storage_ready(struct kevent_storage *st,
+		kevent_callback_t ready_callback, u32 event);
+int kevent_storage_init(void *origin, struct kevent_storage *st);
+void kevent_storage_fini(struct kevent_storage *st);
+int kevent_storage_enqueue(struct kevent_storage *st, struct kevent *k);
+void kevent_storage_dequeue(struct kevent_storage *st, struct kevent *k);
+
+void kevent_ready(struct kevent *k, int ret);
+
+int kevent_user_add_ukevent(struct ukevent *uk, struct kevent_user *u);
+
+#ifdef CONFIG_KEVENT_POLL
+void kevent_poll_reinit(struct file *file);
+#else
+static inline void kevent_poll_reinit(struct file *file)
+{
+}
+#endif
+
+#ifdef CONFIG_KEVENT_USER_STAT
+static inline void kevent_stat_init(struct kevent_user *u)
+{
+	u->wait_num = u->im_num = u->total = u->ring_num = 0;
+}
+static inline void kevent_stat_print(struct kevent_user *u)
+{
+	printk(KERN_INFO "%s: u: %p, wait: %lu, ring: %lu, immediately: %lu, total: %lu.\n",
+			__func__, u, u->wait_num, u->ring_num, u->im_num, u->total);
+}
+static inline void kevent_stat_im(struct kevent_user *u)
+{
+	u->im_num++;
+}
+static inline void kevent_stat_ring(struct kevent_user *u)
+{
+	u->ring_num++;
+}
+static inline void kevent_stat_wait(struct kevent_user *u)
+{
+	u->wait_num++;
+}
+static inline void kevent_stat_total(struct kevent_user *u)
+{
+	u->total++;
+}
+#else
+#define kevent_stat_print(u)		({ (void) u;})
+#define kevent_stat_init(u)		({ (void) u;})
+#define kevent_stat_im(u)		({ (void) u;})
+#define kevent_stat_wait(u)		({ (void) u;})
+#define kevent_stat_ring(u)		({ (void) u;})
+#define kevent_stat_total(u)		({ (void) u;})
+#endif
+
+void kevent_user_free(struct kevent_user *u);
+
+/*
+ * Kevent userspace control block reference counting.
+ * Set to 1 at creation time, when appropriate kevent file descriptor
+ * is closed, that reference counter is decreased.
+ * When counter hits zero block is freed.
+ */
+static inline void kevent_user_get(struct kevent_user *u)
+{
+	atomic_inc(&u->refcnt);
+}
+
+static inline void kevent_user_put(struct kevent_user *u)
+{
+	if (atomic_dec_and_test(&u->refcnt)) {
+		kevent_user_free(u);
+
+	}
+}
+
+#ifdef CONFIG_LOCKDEP
+void kevent_socket_reinit(struct socket *sock);
+void kevent_sk_reinit(struct sock *sk);
+#else
+static inline void kevent_socket_reinit(struct socket *sock)
+{
+}
+static inline void kevent_sk_reinit(struct sock *sk)
+{
+}
+#endif
+#ifdef CONFIG_KEVENT_SOCKET
+void kevent_socket_notify(struct sock *sock, u32 event);
+int kevent_socket_dequeue(struct kevent *k);
+int kevent_socket_enqueue(struct kevent *k);
+#define sock_async(__sk) sock_flag(__sk, SOCK_ASYNC)
+#else
+static inline void kevent_socket_notify(struct sock *sock, u32 event)
+{
+}
+#define sock_async(__sk)	({ (void)__sk; 0; })
+#endif
+
+#ifdef CONFIG_KEVENT_POLL
+static inline void kevent_init_file(struct file *file)
+{
+	kevent_storage_init(file, &file->st);
+}
+
+static inline void kevent_cleanup_file(struct file *file)
+{
+	kevent_storage_fini(&file->st);
+}
+#else
+static inline void kevent_init_file(struct file *file) {}
+static inline void kevent_cleanup_file(struct file *file) {}
+#endif
+
+#ifdef CONFIG_KEVENT_PIPE
+extern void kevent_pipe_notify(struct inode *inode, u32 events);
+#else
+static inline void kevent_pipe_notify(struct inode *inode, u32 events) {}
+#endif
+
+#ifdef CONFIG_KEVENT_SIGNAL
+extern int kevent_signal_notify(struct task_struct *tsk, int sig);
+#else
+static inline int kevent_signal_notify(struct task_struct *tsk, int sig) {return 0;}
+#endif
+
+#endif /* __KEVENT_H */
diff --git a/include/linux/kevent_storage.h b/include/linux/kevent_storage.h
new file mode 100644
index 0000000..a38575d
--- /dev/null
+++ b/include/linux/kevent_storage.h
@@ -0,0 +1,11 @@
+#ifndef __KEVENT_STORAGE_H
+#define __KEVENT_STORAGE_H
+
+struct kevent_storage
+{
+	void			*origin;		/* Originator's pointer, e.g. struct sock or struct file. Can be NULL. */
+	struct list_head	list;			/* List of queued kevents. */
+	spinlock_t		lock;			/* Protects users queue. */
+};
+
+#endif /* __KEVENT_STORAGE_H */
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 1912c6c..2251aa2 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -54,6 +54,8 @@ struct compat_stat;
 struct compat_timeval;
 struct robust_list_head;
 struct getcpu_cache;
+struct ukevent;
+struct kevent_ring;
 
 #include <linux/types.h>
 #include <linux/aio_abi.h>
@@ -603,6 +605,16 @@ asmlinkage long sys_set_robust_list(struct robust_list_head __user *head,
 				    size_t len);
 asmlinkage long sys_getcpu(unsigned __user *cpu, unsigned __user *node, struct getcpu_cache __user *cache);
 
+asmlinkage long sys_kevent_get_events(int ctl_fd, unsigned int min, unsigned int max,
+		struct timespec timeout, struct ukevent __user *buf, unsigned flags);
+asmlinkage long sys_kevent_ctl(int ctl_fd, unsigned int cmd, unsigned int num, struct ukevent __user *buf);
+asmlinkage long sys_kevent_wait(int ctl_fd, unsigned int num, unsigned int old_uidx, 
+		struct timespec timeout, unsigned int flags);
+asmlinkage long sys_kevent_commit(int ctl_fd, unsigned int new_uidx, unsigned int over);
+asmlinkage long sys_kevent_init(int ctl_fd, struct kevent_ring __user *ring, unsigned int num, unsigned int flags);
+
 int kernel_execve(const char *filename, char *const argv[], char *const envp[]);
 
+asmlinkage long sys_aio_sendfile(int kevent_fd, int sock_fd, int in_fd, off_t offset, size_t count);
+
 #endif
diff --git a/include/linux/ukevent.h b/include/linux/ukevent.h
new file mode 100644
index 0000000..d975407
--- /dev/null
+++ b/include/linux/ukevent.h
@@ -0,0 +1,191 @@
+/*
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef __UKEVENT_H
+#define __UKEVENT_H
+
+#include <linux/types.h>
+
+/*
+ * Kevent request flags.
+ */
+
+/* Process this event only once and then remove it. */
+#define KEVENT_REQ_ONESHOT	0x1
+/* Kevent wakes up only first thread interested in given event,
+ * or all threads if this flag is set.
+ */
+#define KEVENT_REQ_WAKEUP_ALL	0x2
+/* Edge Triggered behaviour. */
+#define KEVENT_REQ_ET		0x4
+/* Perform the last check on kevent (call appropriate callback) when
+ * kevent is marked as ready and has been removed from ready queue.
+ * If it will be confirmed that kevent is ready 
+ * (k->callbacks.callback(k) returns true) then kevent will be copied
+ * to userspace, otherwise it will be requeued back to storage. 
+ * Second (checking) call is performed with this bit _cleared_ so
+ * callback can detect when it was called from 
+ * kevent_storage_ready() - bit is set, or 
+ * kevent_dequeue_ready() - bit is cleared. 
+ * If kevent will be requeued, bit will be set again. */
+#define KEVENT_REQ_LAST_CHECK	0x8
+/*
+ * Always queue kevent even if it is immediately ready.
+ */
+#define KEVENT_REQ_ALWAYS_QUEUE	0x10
+/*
+ * Mark event as ready immediately on enqueueing time.
+ */
+#define KEVENT_REQ_READY	0x20
+
+/*
+ * Kevent return flags.
+ */
+/* Kevent is broken. */
+#define KEVENT_RET_BROKEN	0x1
+/* Kevent processing was finished successfully. */
+#define KEVENT_RET_DONE		0x2
+/* Kevent was not copied into ring buffer due to some error conditions. */
+#define KEVENT_RET_COPY_FAILED	0x4
+
+/*
+ * Kevent type set.
+ */
+#define KEVENT_SOCKET 		0
+#define KEVENT_INODE		1
+#define KEVENT_TIMER		2
+#define KEVENT_POLL		3
+#define KEVENT_NAIO		4
+#define KEVENT_AIO		5
+#define KEVENT_PIPE		6
+#define KEVENT_SIGNAL		7
+#define KEVENT_POSIX_TIMER	8
+#define KEVENT_UNOTIFY		9
+
+/* Used as array size, so must be equal to the last KEVENT type + 1 */
+#define	KEVENT_MAX		10
+
+/*
+ * Per-type event sets.
+ * Number of per-event sets should be exactly as number of kevent types.
+ */
+
+/*
+ * Timer events.
+ */
+#define	KEVENT_TIMER_FIRED	0x1
+
+/*
+ * Socket/network asynchronous IO and PIPE events.
+ */
+#define	KEVENT_SOCKET_RECV	0x1
+#define	KEVENT_SOCKET_ACCEPT	0x2
+#define	KEVENT_SOCKET_SEND	0x4
+
+/*
+ * Inode events.
+ */
+#define	KEVENT_INODE_CREATE	0x1
+#define	KEVENT_INODE_REMOVE	0x2
+
+/*
+ * Poll events.
+ */
+#define	KEVENT_POLL_POLLIN	0x0001
+#define	KEVENT_POLL_POLLPRI	0x0002
+#define	KEVENT_POLL_POLLOUT	0x0004
+#define	KEVENT_POLL_POLLERR	0x0008
+#define	KEVENT_POLL_POLLHUP	0x0010
+#define	KEVENT_POLL_POLLNVAL	0x0020
+
+#define	KEVENT_POLL_POLLRDNORM	0x0040
+#define	KEVENT_POLL_POLLRDBAND	0x0080
+#define	KEVENT_POLL_POLLWRNORM	0x0100
+#define	KEVENT_POLL_POLLWRBAND	0x0200
+#define	KEVENT_POLL_POLLMSG	0x0400
+#define	KEVENT_POLL_POLLREMOVE	0x1000
+#define KEVENT_POLL_POLLRDHUP	0x2000
+
+/*
+ * Asynchronous IO events.
+ */
+#define	KEVENT_AIO_BIO		0x1
+
+/*
+ * Signal events.
+ */
+#define KEVENT_SIGNAL_DELIVERY		0x1
+
+/* If set in raw64, then given signals will not be delivered
+ * in a usual way through sigmask update and signal callback 
+ * invocation. */
+#define KEVENT_SIGNAL_NOMASK	0x8000000000000000ULL
+
+/* Mask of all possible event values. */
+#define KEVENT_MASK_ALL		0xffffffff
+/* Empty mask of ready events. */
+#define KEVENT_MASK_EMPTY	0x0
+
+struct kevent_id
+{
+	union {
+		__u32		raw[2];
+		__u64		raw_u64 __attribute__((aligned(8)));
+	};
+};
+
+struct ukevent
+{
+	/* Id of this request, e.g. socket number, file descriptor and so on... */
+	struct kevent_id	id;
+	/* Event type, e.g. KEVENT_SOCK, KEVENT_INODE, KEVENT_TIMER and so on... */
+	__u32			type;
+	/* Event itself, e.g. SOCK_ACCEPT, INODE_CREATED, TIMER_FIRED... */
+	__u32			event;
+	/* Per-event request flags */
+	__u32			req_flags;
+	/* Per-event return flags */
+	__u32			ret_flags;
+	/* Event return data. Event originator fills it with anything it likes. */
+	__u32			ret_data[2];
+	/* User's data. It is not used, just copied to/from user.
+	 * The whole structure is aligned to 8 bytes already, so the last union
+	 * is aligned properly.
+	 */
+	union {
+		__u32		user[2];
+		void		*ptr;
+	};
+};
+
+struct kevent_ring
+{
+	unsigned int		ring_kidx, ring_over;
+	struct ukevent		event[0];
+};
+
+#define	KEVENT_CTL_ADD 		0
+#define	KEVENT_CTL_REMOVE	1
+#define	KEVENT_CTL_MODIFY	2
+#define	KEVENT_CTL_READY	3
+
+/* Provided timespec parameter uses absolute time, i.e. 'wait until Aug 27, 2194' */
+#define KEVENT_FLAGS_ABSTIME	1
+
+#endif /* __UKEVENT_H */
diff --git a/init/Kconfig b/init/Kconfig
index a3f83e2..561a0fa 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -219,6 +219,8 @@ config AUDITSYSCALL
 	  such as SELinux.  To use audit's filesystem watch feature, please
 	  ensure that INOTIFY is configured.
 
+source "kernel/kevent/Kconfig"
+
 config IKCONFIG
 	tristate "Kernel .config support"
 	---help---
diff --git a/kernel/Makefile b/kernel/Makefile
index 14f4d45..6bccdbb 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -46,6 +46,7 @@ obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
 obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
+obj-$(CONFIG_KEVENT) += kevent/
 obj-$(CONFIG_RELAY) += relay.o
 obj-$(CONFIG_UTS_NS) += utsname.o
 obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
diff --git a/kernel/kevent/Kconfig b/kernel/kevent/Kconfig
new file mode 100644
index 0000000..a23e84f
--- /dev/null
+++ b/kernel/kevent/Kconfig
@@ -0,0 +1,82 @@
+config KEVENT
+	bool "Kernel event notification mechanism" if EMBEDDED
+	default y
+	help
+	  This option enables event queue mechanism.
+	  It can be used as replacement for poll()/select(), AIO callback
+	  invocations, advanced timer notifications and other kernel
+	  object status changes.
+
+config KEVENT_USER_STAT
+	bool "Kevent user statistic"
+	depends on KEVENT
+	help
+	  This option will turn kevent_user statistic collection on.
+	  Statistic data includes total number of kevent, number of kevents
+	  which are ready immediately at insertion time and number of kevents
+	  which were removed through readiness completion.
+	  It will be printed each time control kevent descriptor is closed.
+
+config KEVENT_TIMER
+	bool "Kernel event notifications for timers"
+	depends on KEVENT
+	default y
+	help
+	  This option allows to use timers through KEVENT subsystem.
+
+config KEVENT_POLL
+	bool "Kernel event notifications for poll()/select()"
+	depends on KEVENT
+	default y
+	help
+	  This option allows to use kevent subsystem for poll()/select()
+	  notifications.
+
+config KEVENT_SOCKET
+	bool "Kernel event notifications for sockets"
+	depends on NET && KEVENT
+	default y
+	help
+	  This option enables notifications through KEVENT subsystem of 
+	  sockets operations, like new packet receiving conditions, 
+	  ready for accept conditions and so on.
+
+config KEVENT_PIPE
+	bool "Kernel event notifications for pipes"
+	depends on KEVENT
+	default y
+	help
+	  This option enables notifications through KEVENT subsystem of 
+	  pipe read/write operations.
+
+config KEVENT_SIGNAL
+	bool "Kernel event notifications for signals"
+	depends on KEVENT
+	default y
+	help
+	  This option enables signal delivery through KEVENT subsystem.
+	  Signals which were requested to be delivered through kevent
+	  subsystem must be registered through usual signal() and others
+	  syscalls, this option allows alternative delivery.
+	  With KEVENT_SIGNAL_NOMASK flag being set in kevent for set of 
+	  signals, they will not be delivered in a usual way.
+	  Kevents for appropriate signals are not copied when process forks,
+	  new process must add new kevents after fork(). Mask of signals
+	  is copied as before.
+
+config KEVENT_UNOTIFY
+	bool "Private userspace notifications over kevent"
+	depends on KEVENT
+	default y
+	help
+	  This option enable possibility to insert private userspace events,
+	  which can be marked as ready on demand using kevent_ctl(KEVENT_CTL_READY)
+	  command.
+
+config KEVENT_AIO
+	bool "Kevent based AIO"
+	depends on KEVENT && NET
+	default y
+	help
+	  This option allows to work with kevent based AIO state machine.
+	  Among others this allows to implement aio_sendfile() syscall.
diff --git a/kernel/kevent/Makefile b/kernel/kevent/Makefile
new file mode 100644
index 0000000..dc7f8b2
--- /dev/null
+++ b/kernel/kevent/Makefile
@@ -0,0 +1,8 @@
+obj-y := kevent.o kevent_user.o
+obj-$(CONFIG_KEVENT_TIMER) += kevent_timer.o
+obj-$(CONFIG_KEVENT_POLL) += kevent_poll.o
+obj-$(CONFIG_KEVENT_SOCKET) += kevent_socket.o
+obj-$(CONFIG_KEVENT_PIPE) += kevent_pipe.o
+obj-$(CONFIG_KEVENT_SIGNAL) += kevent_signal.o
+obj-$(CONFIG_KEVENT_UNOTIFY) += kevent_unotify.o
+obj-$(CONFIG_KEVENT_AIO) += kevent_aio.o
diff --git a/kernel/kevent/kevent.c b/kernel/kevent/kevent.c
new file mode 100644
index 0000000..743cd0c
--- /dev/null
+++ b/kernel/kevent/kevent.c
@@ -0,0 +1,267 @@
+/*
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/mempool.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/kevent.h>
+
+/*
+ * Attempts to add an event into appropriate origin's queue.
+ * Returns positive value if this event is ready immediately,
+ * negative value in case of error and zero if event has been queued.
+ * ->enqueue() callback must increase origin's reference counter.
+ */
+int kevent_enqueue(struct kevent *k)
+{
+	return k->callbacks.enqueue(k);
+}
+
+/*
+ * Remove event from the appropriate queue.
+ * ->dequeue() callback must decrease origin's reference counter.
+ */
+int kevent_dequeue(struct kevent *k)
+{
+	return k->callbacks.dequeue(k);
+}
+
+/*
+ * Mark kevent as broken.
+ */
+int kevent_break(struct kevent *k)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&k->ulock, flags);
+	k->event.ret_flags |= KEVENT_RET_BROKEN;
+	spin_unlock_irqrestore(&k->ulock, flags);
+	return -EINVAL;
+}
+
+static struct kevent_callbacks kevent_registered_callbacks[KEVENT_MAX] __read_mostly;
+
+int kevent_event_is_allowed(struct ukevent *e)
+{
+	if (unlikely(e->type >= KEVENT_MAX))
+		return 0;
+
+	if (!kevent_registered_callbacks[e->type].callback)
+		return 0;
+
+	if (unlikely(kevent_registered_callbacks[e->type].callback == kevent_break))
+		return 0;
+
+	if (kevent_registered_callbacks[e->type].flags & KEVENT_CALLBACKS_KERNELONLY)
+		return 0;
+	
+	return 1;
+}
+
+int kevent_add_callbacks(const struct kevent_callbacks *cb, int pos)
+{
+	struct kevent_callbacks *p;
+
+	if (pos >= KEVENT_MAX)
+		return -EINVAL;
+
+	p = &kevent_registered_callbacks[pos];
+
+	p->enqueue = (cb->enqueue) ? cb->enqueue : kevent_break;
+	p->dequeue = (cb->dequeue) ? cb->dequeue : kevent_break;
+	p->callback = (cb->callback) ? cb->callback : kevent_break;
+	p->flags = cb->flags;
+
+	printk(KERN_INFO "KEVENT: Added callbacks for type %d.\n", pos);
+	return 0;
+}
+
+/*
+ * Must be called before event is going to be added into some origin's queue.
+ * Initializes ->enqueue(), ->dequeue() and ->callback() callbacks.
+ * If failed, kevent should not be used or kevent_enqueue() will fail to add
+ * this kevent into origin's queue with setting
+ * KEVENT_RET_BROKEN flag in kevent->event.ret_flags.
+ */
+int kevent_init(struct kevent *k)
+{
+	spin_lock_init(&k->ulock);
+	k->flags = 0;
+
+	if (unlikely(k->event.type >= KEVENT_MAX)) {
+		kevent_break(k);
+		return -ENOSYS;
+	}
+
+	if (!kevent_registered_callbacks[k->event.type].callback) {
+		kevent_break(k);
+		return -ENOSYS;
+	}
+
+	k->callbacks = kevent_registered_callbacks[k->event.type];
+	if (unlikely(k->callbacks.callback == kevent_break)) {
+		kevent_break(k);
+		return -ENOSYS;
+	}
+
+	return 0;
+}
+
+/*
+ * Called from ->enqueue() callback when reference counter for given
+ * origin (socket, inode...) has been increased.
+ */
+int kevent_storage_enqueue(struct kevent_storage *st, struct kevent *k)
+{
+	unsigned long flags;
+
+	k->st = st;
+	spin_lock_irqsave(&st->lock, flags);
+	list_add_tail_rcu(&k->storage_entry, &st->list);
+	k->flags |= KEVENT_STORAGE;
+	if (k->event.req_flags & KEVENT_REQ_READY)
+		kevent_ready(k, 1);
+	spin_unlock_irqrestore(&st->lock, flags);
+	return 0;
+}
+
+/*
+ * Dequeue kevent from origin's queue.
+ * It does not decrease origin's reference counter in any way
+ * and must be called before it, so storage itself must be valid.
+ * It is called from ->dequeue() callback.
+ */
+void kevent_storage_dequeue(struct kevent_storage *st, struct kevent *k)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&st->lock, flags);
+	if (k->flags & KEVENT_STORAGE) {
+		list_del_rcu(&k->storage_entry);
+		k->flags &= ~KEVENT_STORAGE;
+	}
+	spin_unlock_irqrestore(&st->lock, flags);
+}
+
+void kevent_ready(struct kevent *k, int ret)
+{
+	unsigned long flags;
+	int rem;
+
+	spin_lock_irqsave(&k->ulock, flags);
+	if (ret > 0)
+		k->event.ret_flags |= KEVENT_RET_DONE;
+	else if (ret < 0)
+		k->event.ret_flags |= (KEVENT_RET_BROKEN | KEVENT_RET_DONE);
+	else
+		ret = (k->event.ret_flags & (KEVENT_RET_BROKEN|KEVENT_RET_DONE));
+	rem = (k->event.req_flags & KEVENT_REQ_ONESHOT);
+	spin_unlock_irqrestore(&k->ulock, flags);
+
+	if (ret) {
+		if ((rem || ret < 0) && (k->flags & KEVENT_STORAGE)) {
+			list_del_rcu(&k->storage_entry);
+			k->flags &= ~KEVENT_STORAGE;
+		}
+
+		spin_lock_irqsave(&k->user->ready_lock, flags);
+		if (!(k->flags & KEVENT_READY)) {
+			list_add_tail(&k->ready_entry, &k->user->ready_list);
+			k->flags |= KEVENT_READY;
+			k->user->ready_num++;
+		}
+		spin_unlock_irqrestore(&k->user->ready_lock, flags);
+		wake_up(&k->user->wait);
+	}
+}
+
+/*
+ * Call kevent ready callback and queue it into ready queue if needed.
+ * If kevent is marked as one-shot, then remove it from storage queue.
+ */
+static int __kevent_requeue(struct kevent *k, u32 event)
+{
+	int ret;
+
+	ret = k->callbacks.callback(k);
+
+	kevent_ready(k, ret);
+
+	return ret;
+}
+
+/*
+ * Check if kevent is ready (by invoking it's callback) and requeue/remove
+ * if needed.
+ */
+void kevent_requeue(struct kevent *k)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&k->st->lock, flags);
+	__kevent_requeue(k, 0);
+	spin_unlock_irqrestore(&k->st->lock, flags);
+}
+
+/*
+ * Called each time some activity in origin (socket, inode...) is noticed.
+ */
+void kevent_storage_ready(struct kevent_storage *st,
+		kevent_callback_t ready_callback, u32 event)
+{
+	struct kevent *k;
+	int wake_num = 0;
+
+	rcu_read_lock();
+	if (unlikely(ready_callback))
+		list_for_each_entry_rcu(k, &st->list, storage_entry)
+			(*ready_callback)(k);
+
+	list_for_each_entry_rcu(k, &st->list, storage_entry) {
+		if (event & k->event.event)
+			if ((k->event.req_flags & KEVENT_REQ_WAKEUP_ALL) || wake_num == 0)
+				if (__kevent_requeue(k, event))
+					wake_num++;
+	}
+	rcu_read_unlock();
+}
+
+int kevent_storage_init(void *origin, struct kevent_storage *st)
+{
+	spin_lock_init(&st->lock);
+	st->origin = origin;
+	INIT_LIST_HEAD(&st->list);
+	return 0;
+}
+
+/*
+ * Mark all events as broken, that will remove them from storage,
+ * so storage origin (inode, socket and so on) can be safely removed.
+ * No new entries are allowed to be added into the storage at this point.
+ * (Socket is removed from file table at this point for example).
+ */
+void kevent_storage_fini(struct kevent_storage *st)
+{
+	kevent_storage_ready(st, kevent_break, KEVENT_MASK_ALL);
+}
diff --git a/kernel/kevent/kevent_user.c b/kernel/kevent/kevent_user.c
new file mode 100644
index 0000000..f25d696
--- /dev/null
+++ b/kernel/kevent/kevent_user.c
@@ -0,0 +1,1360 @@
+/*
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/device.h>
+#include <linux/poll.h>
+#include <linux/kevent.h>
+#include <linux/miscdevice.h>
+#include <asm/io.h>
+
+static struct kmem_cache *kevent_cache __read_mostly;
+static struct kmem_cache *kevent_user_cache __read_mostly;
+
+static int kevent_debug_abstime;
+
+/*
+ * kevents are pollable, return POLLIN and POLLRDNORM
+ * when there is at least one ready kevent.
+ */
+static unsigned int kevent_user_poll(struct file *file, struct poll_table_struct *wait)
+{
+	struct kevent_user *u = file->private_data;
+	unsigned int mask;
+
+	poll_wait(file, &u->wait, wait);
+	mask = 0;
+
+	if (u->ready_num || u->need_exit)
+		mask |= POLLIN | POLLRDNORM;
+	u->need_exit = 0;
+
+	return mask;
+}
+
+static inline unsigned int kevent_ring_space(struct kevent_user *u)
+{
+	if (!u->pring)
+		return 1;
+
+	if (u->full)
+		return 0;
+
+	return (u->uidx > u->kidx)?
+		(u->uidx - u->kidx):
+		(u->ring_size - (u->kidx - u->uidx));
+}
+
+static inline int kevent_ring_index_inc(unsigned int *pidx, unsigned int size)
+{
+	unsigned int idx = *pidx;
+
+	if (++idx >= size)
+		idx = 0;
+	*pidx = idx;
+	return (idx == 0);
+}
+
+/*
+ * Copies kevent into userspace ring buffer if it was initialized.
+ * Returns 
+ *  0 on success or if ring buffer is not used
+ *  -EAGAIN if there were no place for that kevent
+ *  -EFAULT if copy_to_user() failed.
+ *
+ *  Must be called under kevent_user->ring_lock locked.
+ */
+static int kevent_copy_ring_buffer(struct kevent *k)
+{
+	struct kevent_ring __user *ring;
+	struct kevent_user *u = k->user;
+	unsigned long flags;
+	int err;
+
+	ring = u->pring;
+	if (!ring)
+		return 0;
+
+	if (!kevent_ring_space(u))
+		return -EAGAIN;
+
+	if (copy_to_user(&ring->event[u->kidx], &k->event, sizeof(struct ukevent))) {
+		err = -EFAULT;
+		goto err_out_exit;
+	}
+
+	kevent_ring_index_inc(&u->kidx, u->ring_size);
+
+	if (u->kidx == u->uidx)
+		u->full = 1;
+
+	if (put_user(u->kidx, &ring->ring_kidx)) {
+		err = -EFAULT;
+		goto err_out_exit;
+	}
+
+	return 0;
+
+err_out_exit:
+	spin_lock_irqsave(&k->ulock, flags);
+	k->event.ret_flags |= KEVENT_RET_COPY_FAILED;
+	spin_unlock_irqrestore(&k->ulock, flags);
+	return err;
+}
+
+static struct kevent_user *kevent_user_alloc(struct kevent_ring __user *ring, unsigned int num)
+{
+	struct kevent_user *u;
+
+	u = kmem_cache_zalloc(kevent_user_cache, GFP_KERNEL);
+	if (!u)
+		return NULL;
+
+	INIT_LIST_HEAD(&u->ready_list);
+	spin_lock_init(&u->ready_lock);
+	kevent_stat_init(u);
+	spin_lock_init(&u->kevent_lock);
+	u->kevent_root = RB_ROOT;
+
+	mutex_init(&u->ctl_mutex);
+	init_waitqueue_head(&u->wait);
+	u->need_exit = 0;
+
+	atomic_set(&u->refcnt, 1);
+
+	mutex_init(&u->ring_lock);
+	u->kidx = u->uidx = u->ring_over = u->full = 0;
+
+	u->pring = ring;
+	u->ring_size = num;
+
+	hrtimer_init(&u->timer, CLOCK_REALTIME, HRTIMER_ABS);
+
+	kevent_storage_init(u, &u->st);
+
+	return u;
+}
+
+void kevent_user_free(struct kevent_user *u)
+{
+	kevent_stat_print(u);
+	hrtimer_cancel(&u->timer);
+	kmem_cache_free(kevent_user_cache, u);
+}
+
+static inline int kevent_compare_id(struct kevent_id *left, struct kevent_id *right, 
+		__u32 type_left, __u32 type_right)
+{
+	if (left->raw_u64 > right->raw_u64)
+		return -1;
+
+	if (right->raw_u64 > left->raw_u64)
+		return 1;
+
+	if (type_left > type_right)
+		return -1;
+
+	if (type_left < type_right)
+		return 1;
+
+	return 0;
+}
+
+/*
+ * RCU protects storage list (kevent->storage_entry).
+ * Free entry in RCU callback, it is dequeued from all lists at
+ * this point.
+ */
+
+static void kevent_free_rcu(struct rcu_head *rcu)
+{
+	struct kevent *kevent = container_of(rcu, struct kevent, rcu_head);
+	kmem_cache_free(kevent_cache, kevent);
+}
+
+/*
+ * Must be called under u->ready_lock.
+ * This function unlinks kevent from ready queue.
+ */
+static inline void kevent_unlink_ready(struct kevent *k)
+{
+	list_del(&k->ready_entry);
+	k->flags &= ~KEVENT_READY;
+	k->user->ready_num--;
+}
+
+static void kevent_remove_ready(struct kevent *k)
+{
+	struct kevent_user *u = k->user;
+	unsigned long flags;
+
+	spin_lock_irqsave(&u->ready_lock, flags);
+	if (k->flags & KEVENT_READY)
+		kevent_unlink_ready(k);
+	spin_unlock_irqrestore(&u->ready_lock, flags);
+}
+
+/*
+ * Complete kevent removing - it dequeues kevent from storage list
+ * if it is requested, removes kevent from ready list, drops userspace
+ * control block reference counter and schedules kevent freeing through RCU.
+ */
+static void kevent_finish_user_complete(struct kevent *k, int deq)
+{
+	if (deq)
+		kevent_dequeue(k);
+
+	kevent_remove_ready(k);
+
+	kevent_user_put(k->user);
+	call_rcu(&k->rcu_head, kevent_free_rcu);
+}
+
+/*
+ * Remove from all lists and free kevent.
+ * Must be called under kevent_user->kevent_lock to protect
+ * kevent->kevent_entry removing.
+ */
+static void __kevent_finish_user(struct kevent *k, int deq)
+{
+	struct kevent_user *u = k->user;
+
+	rb_erase(&k->kevent_node, &u->kevent_root);
+	k->flags &= ~KEVENT_USER;
+	u->kevent_num--;
+	kevent_finish_user_complete(k, deq);
+}
+
+/*
+ * Remove kevent from user's list of all events,
+ * dequeue it from storage and decrease user's reference counter,
+ * since this kevent does not exist anymore. That is why it is freed here.
+ */
+static void kevent_finish_user(struct kevent *k, int deq)
+{
+	struct kevent_user *u = k->user;
+	unsigned long flags;
+
+	spin_lock_irqsave(&u->kevent_lock, flags);
+	rb_erase(&k->kevent_node, &u->kevent_root);
+	k->flags &= ~KEVENT_USER;
+	u->kevent_num--;
+	spin_unlock_irqrestore(&u->kevent_lock, flags);
+	kevent_finish_user_complete(k, deq);
+}
+
+static struct kevent *__kevent_dequeue_ready_one(struct kevent_user *u)
+{
+	unsigned long flags;
+	struct kevent *k = NULL;
+
+	if (u->ready_num) {
+		spin_lock_irqsave(&u->ready_lock, flags);
+		if (u->ready_num && !list_empty(&u->ready_list)) {
+			k = list_entry(u->ready_list.next, struct kevent, ready_entry);
+			kevent_unlink_ready(k);
+		}
+		spin_unlock_irqrestore(&u->ready_lock, flags);
+	}
+
+	return k;
+}
+
+static struct kevent *kevent_dequeue_ready_one(struct kevent_user *u)
+{
+	struct kevent *k = NULL;
+
+	while (u->ready_num && !k) {
+		k = __kevent_dequeue_ready_one(u);
+
+		if (k && (k->event.req_flags & KEVENT_REQ_LAST_CHECK)) {
+			unsigned long flags;
+
+			spin_lock_irqsave(&k->ulock, flags);
+			k->event.req_flags &= ~KEVENT_REQ_LAST_CHECK;
+			spin_unlock_irqrestore(&k->ulock, flags);
+
+			if (!k->callbacks.callback(k)) {
+				spin_lock_irqsave(&k->ulock, flags);
+				k->event.req_flags |= KEVENT_REQ_LAST_CHECK;
+				k->event.ret_flags = 0;
+				k->event.ret_data[0] = k->event.ret_data[1] = 0;
+				spin_unlock_irqrestore(&k->ulock, flags);
+				k = NULL;
+			}
+		} else
+			break;
+	}
+
+	return k;
+}
+
+static inline void kevent_copy_ring(struct kevent *k)
+{
+	unsigned long flags;
+
+	if (!k)
+		return;
+
+	if (kevent_copy_ring_buffer(k)) {
+		spin_lock_irqsave(&k->ulock, flags);
+		k->event.ret_flags |= KEVENT_RET_COPY_FAILED;
+		spin_unlock_irqrestore(&k->ulock, flags);
+	}
+}
+
+/*
+ * Dequeue one entry from user's ready queue.
+ */
+static struct kevent *kevent_dequeue_ready(struct kevent_user *u)
+{
+	struct kevent *k;
+
+	mutex_lock(&u->ring_lock);
+	k = kevent_dequeue_ready_one(u);
+	kevent_copy_ring(k);
+	mutex_unlock(&u->ring_lock);
+
+	return k;
+}
+
+/*
+ * Dequeue one entry from user's ready queue if there is space in ring buffer.
+ */
+static struct kevent *kevent_dequeue_ready_ring(struct kevent_user *u)
+{
+	struct kevent *k = NULL;
+
+	mutex_lock(&u->ring_lock);
+	if (kevent_ring_space(u)) {
+		k = kevent_dequeue_ready_one(u);
+		kevent_copy_ring(k);
+	}
+	mutex_unlock(&u->ring_lock);
+
+	return k;
+}
+
+static void kevent_complete_ready(struct kevent *k)
+{
+	if (k->event.req_flags & KEVENT_REQ_ONESHOT)
+		/*
+		 * If it is one-shot kevent, it has been removed already from
+		 * origin's queue, so we can easily free it here.
+		 */
+		kevent_finish_user(k, 1);
+	else if (k->event.req_flags & KEVENT_REQ_ET) {
+		unsigned long flags;
+
+		/*
+		 * Edge-triggered behaviour: mark event as clear new one.
+		 */
+
+		spin_lock_irqsave(&k->ulock, flags);
+		k->event.ret_flags = 0;
+		k->event.ret_data[0] = k->event.ret_data[1] = 0;
+		spin_unlock_irqrestore(&k->ulock, flags);
+	}
+}
+
+/*
+ * Search a kevent inside kevent tree for given ukevent.
+ */
+static struct kevent *__kevent_search(struct kevent_id *id, __u32 type, struct kevent_user *u)
+{
+	struct kevent *k, *ret = NULL;
+	struct rb_node *n = u->kevent_root.rb_node;
+	int cmp;
+
+	while (n) {
+		k = rb_entry(n, struct kevent, kevent_node);
+		cmp = kevent_compare_id(&k->event.id, id, k->event.type, type);
+
+		if (cmp > 0)
+			n = n->rb_right;
+		else if (cmp < 0)
+			n = n->rb_left;
+		else {
+			ret = k;
+			break;
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * Search and modify kevent according to provided ukevent.
+ */
+static int kevent_modify(struct ukevent *uk, struct kevent_user *u)
+{
+	struct kevent *k;
+	int err = -ENODEV;
+	unsigned long flags;
+
+	spin_lock_irqsave(&u->kevent_lock, flags);
+	k = __kevent_search(&uk->id, uk->type, u);
+	if (k) {
+		spin_lock(&k->ulock);
+		k->event.event = uk->event;
+		k->event.req_flags = uk->req_flags;
+		k->event.ret_flags = 0;
+		spin_unlock(&k->ulock);
+		kevent_requeue(k);
+		err = 0;
+	}
+	spin_unlock_irqrestore(&u->kevent_lock, flags);
+
+	return err;
+}
+
+/*
+ * Remove kevent which matches provided ukevent.
+ */
+static int kevent_remove(struct ukevent *uk, struct kevent_user *u)
+{
+	int err = -ENODEV;
+	struct kevent *k;
+	unsigned long flags;
+
+	spin_lock_irqsave(&u->kevent_lock, flags);
+	k = __kevent_search(&uk->id, uk->type, u);
+	if (k) {
+		__kevent_finish_user(k, 1);
+		err = 0;
+	}
+	spin_unlock_irqrestore(&u->kevent_lock, flags);
+
+	return err;
+}
+
+/*
+ * Detaches userspace control block from file descriptor
+ * and decrease it's reference counter.
+ * No new kevents can be added or removed from any list at this point.
+ */
+static int kevent_user_release(struct inode *inode, struct file *file)
+{
+	struct kevent_user *u = file->private_data;
+	struct kevent *k;
+	struct rb_node *n;
+
+	for (n = rb_first(&u->kevent_root); n; n = rb_next(n)) {
+		k = rb_entry(n, struct kevent, kevent_node);
+		kevent_finish_user(k, 1);
+	}
+
+	kevent_user_put(u);
+	file->private_data = NULL;
+
+	return 0;
+}
+
+/*
+ * Read requested number of ukevents in one shot.
+ */
+static struct ukevent *kevent_get_user(unsigned int num, void __user *arg)
+{
+	struct ukevent *ukev;
+
+	ukev = kmalloc(sizeof(struct ukevent) * num, GFP_KERNEL);
+	if (!ukev)
+		return NULL;
+
+	if (copy_from_user(ukev, arg, sizeof(struct ukevent) * num)) {
+		kfree(ukev);
+		return NULL;
+	}
+
+	return ukev;
+}
+
+static int kevent_mark_ready(struct ukevent *uk, struct kevent_user *u)
+{
+	struct kevent *k;
+	int err = -ENODEV;
+	unsigned long flags;
+
+	spin_lock_irqsave(&u->kevent_lock, flags);
+	k = __kevent_search(&uk->id, uk->type, u);
+	if (k) {
+		spin_lock(&k->st->lock);
+		kevent_ready(k, 1);
+		spin_unlock(&k->st->lock);
+		err = 0;
+	}
+	spin_unlock_irqrestore(&u->kevent_lock, flags);
+
+	return err;
+}
+
+/*
+ * Mark appropriate kevents as ready.
+ * If number of events is zero just wake up one listener.
+ */
+static int kevent_user_ctl_ready(struct kevent_user *u, unsigned int num, void __user *arg)
+{
+	int err = -EINVAL, cerr = 0, rnum = 0, i;
+	void __user *orig = arg;
+	struct ukevent uk;
+
+	if (num > u->kevent_num)
+		return err;
+	
+	if (!num) {
+		u->need_exit = 1;
+		wake_up(&u->wait);
+		return 0;
+	}
+
+	mutex_lock(&u->ctl_mutex);
+
+	if (num > KEVENT_MIN_BUFFS_ALLOC) {
+		struct ukevent *ukev;
+
+		ukev = kevent_get_user(num, arg);
+		if (ukev) {
+			for (i = 0; i < num; ++i) {
+				err = kevent_mark_ready(&ukev[i], u);
+				if (err) {
+					if (i != rnum)
+						memcpy(&ukev[rnum], &ukev[i], sizeof(struct ukevent));
+					rnum++;
+				}
+			}
+			if (copy_to_user(orig, ukev, rnum*sizeof(struct ukevent)))
+				cerr = -EFAULT;
+			kfree(ukev);
+			goto out_setup;
+		}
+	}
+
+	for (i = 0; i < num; ++i) {
+		if (copy_from_user(&uk, arg, sizeof(struct ukevent))) {
+			cerr = -EFAULT;
+			break;
+		}
+		arg += sizeof(struct ukevent);
+
+		err = kevent_mark_ready(&uk, u);
+		if (err) {
+			if (copy_to_user(orig, &uk, sizeof(struct ukevent))) {
+				cerr = -EFAULT;
+				break;
+			}
+			orig += sizeof(struct ukevent);
+			rnum++;
+		}
+	}
+
+out_setup:
+	if (cerr < 0) {
+		err = cerr;
+		goto out_remove;
+	}
+
+	err = num - rnum;
+out_remove:
+	mutex_unlock(&u->ctl_mutex);
+
+	return err;
+}
+
+/*
+ * Read from userspace all ukevents and modify appropriate kevents.
+ * If provided number of ukevents is more that threshold, it is faster
+ * to allocate a room for them and copy in one shot instead of copy
+ * one-by-one and then process them.
+ */
+static int kevent_user_ctl_modify(struct kevent_user *u, unsigned int num, void __user *arg)
+{
+	int err = 0, i;
+	struct ukevent uk;
+
+	mutex_lock(&u->ctl_mutex);
+
+	if (num > u->kevent_num) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	if (num > KEVENT_MIN_BUFFS_ALLOC) {
+		struct ukevent *ukev;
+
+		ukev = kevent_get_user(num, arg);
+		if (ukev) {
+			for (i = 0; i < num; ++i) {
+				if (kevent_modify(&ukev[i], u))
+					ukev[i].ret_flags |= KEVENT_RET_BROKEN;
+				ukev[i].ret_flags |= KEVENT_RET_DONE;
+			}
+			if (copy_to_user(arg, ukev, num*sizeof(struct ukevent)))
+				err = -EFAULT;
+			kfree(ukev);
+			goto out;
+		}
+	}
+
+	for (i = 0; i < num; ++i) {
+		if (copy_from_user(&uk, arg, sizeof(struct ukevent))) {
+			err = -EFAULT;
+			break;
+		}
+
+		if (kevent_modify(&uk, u))
+			uk.ret_flags |= KEVENT_RET_BROKEN;
+		uk.ret_flags |= KEVENT_RET_DONE;
+
+		if (copy_to_user(arg, &uk, sizeof(struct ukevent))) {
+			err = -EFAULT;
+			break;
+		}
+
+		arg += sizeof(struct ukevent);
+	}
+out:
+	mutex_unlock(&u->ctl_mutex);
+
+	return err;
+}
+
+/*
+ * Read from userspace all ukevents and remove appropriate kevents.
+ * If provided number of ukevents is more that threshold, it is faster
+ * to allocate a room for them and copy in one shot instead of copy
+ * one-by-one and then process them.
+ */
+static int kevent_user_ctl_remove(struct kevent_user *u, unsigned int num, void __user *arg)
+{
+	int err = 0, i;
+	struct ukevent uk;
+
+	mutex_lock(&u->ctl_mutex);
+
+	if (num > u->kevent_num) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	if (num > KEVENT_MIN_BUFFS_ALLOC) {
+		struct ukevent *ukev;
+
+		ukev = kevent_get_user(num, arg);
+		if (ukev) {
+			for (i = 0; i < num; ++i) {
+				if (kevent_remove(&ukev[i], u))
+					ukev[i].ret_flags |= KEVENT_RET_BROKEN;
+				ukev[i].ret_flags |= KEVENT_RET_DONE;
+			}
+			if (copy_to_user(arg, ukev, num*sizeof(struct ukevent)))
+				err = -EFAULT;
+			kfree(ukev);
+			goto out;
+		}
+	}
+
+	for (i = 0; i < num; ++i) {
+		if (copy_from_user(&uk, arg, sizeof(struct ukevent))) {
+			err = -EFAULT;
+			break;
+		}
+
+		if (kevent_remove(&uk, u))
+			uk.ret_flags |= KEVENT_RET_BROKEN;
+
+		uk.ret_flags |= KEVENT_RET_DONE;
+
+		if (copy_to_user(arg, &uk, sizeof(struct ukevent))) {
+			err = -EFAULT;
+			break;
+		}
+
+		arg += sizeof(struct ukevent);
+	}
+out:
+	mutex_unlock(&u->ctl_mutex);
+
+	return err;
+}
+
+/*
+ * Queue kevent into userspace control block and increase
+ * it's reference counter.
+ */
+static int kevent_user_enqueue(struct kevent_user *u, struct kevent *new)
+{
+	unsigned long flags;
+	struct rb_node **p = &u->kevent_root.rb_node, *parent = NULL;
+	struct kevent *k;
+	int err = 0, cmp;
+
+	spin_lock_irqsave(&u->kevent_lock, flags);
+	while (*p) {
+		parent = *p;
+		k = rb_entry(parent, struct kevent, kevent_node);
+
+		cmp = kevent_compare_id(&k->event.id, &new->event.id,
+				k->event.type, new->event.type);
+		if (cmp > 0)
+			p = &parent->rb_right;
+		else if (cmp < 0)
+			p = &parent->rb_left;
+		else {
+			err = -EEXIST;
+			break;
+		}
+	}
+	if (likely(!err)) {
+		rb_link_node(&new->kevent_node, parent, p);
+		rb_insert_color(&new->kevent_node, &u->kevent_root);
+		new->flags |= KEVENT_USER;
+		u->kevent_num++;
+		kevent_user_get(u);
+	}
+	spin_unlock_irqrestore(&u->kevent_lock, flags);
+
+	return err;
+}
+
+/*
+ * Add kevent from both kernel and userspace users.
+ * This function allocates and queues kevent, returns negative value
+ * on error, positive if kevent is ready immediately and zero
+ * if kevent has been queued.
+ */
+int kevent_user_add_ukevent(struct ukevent *uk, struct kevent_user *u)
+{
+	struct kevent *k;
+	int err;
+
+	k = kmem_cache_alloc(kevent_cache, GFP_KERNEL);
+	if (!k) {
+		err = -ENOMEM;
+		goto err_out_exit;
+	}
+
+	memcpy(&k->event, uk, sizeof(struct ukevent));
+	INIT_RCU_HEAD(&k->rcu_head);
+
+	k->event.ret_flags = 0;
+
+	err = kevent_init(k);
+	if (err) {
+		kmem_cache_free(kevent_cache, k);
+		goto err_out_exit;
+	}
+	k->user = u;
+	kevent_stat_total(u);
+	err = kevent_user_enqueue(u, k);
+	if (err) {
+		kmem_cache_free(kevent_cache, k);
+		goto err_out_exit;
+	}
+
+	err = kevent_enqueue(k);
+	if (err) {
+		memcpy(uk, &k->event, sizeof(struct ukevent));
+		kevent_finish_user(k, 0);
+		goto err_out_exit;
+	}
+
+	return 0;
+
+err_out_exit:
+	if (err < 0) {
+		uk->ret_flags |= KEVENT_RET_BROKEN | KEVENT_RET_DONE;
+		uk->ret_data[1] = err;
+	} else if (err > 0)
+		uk->ret_flags |= KEVENT_RET_DONE;
+	return err;
+}
+
+
+/*
+ * Copy all ukevents from userspace, allocate kevent for each one
+ * and add them into appropriate kevent_storages,
+ * e.g. sockets, inodes and so on...
+ * Ready events will replace ones provided by used and number
+ * of ready events is returned.
+ * User must check ret_flags field of each ukevent structure
+ * to determine if it is fired or failed event.
+ */
+static int kevent_user_ctl_add(struct kevent_user *u, unsigned int num, void __user *arg)
+{
+	int err, cerr = 0, rnum = 0, i;
+	void __user *orig = arg;
+	struct ukevent uk;
+
+	mutex_lock(&u->ctl_mutex);
+
+	err = -EINVAL;
+	if (num > KEVENT_MIN_BUFFS_ALLOC) {
+		struct ukevent *ukev;
+
+		ukev = kevent_get_user(num, arg);
+		if (ukev) {
+			for (i = 0; i < num; ++i) {
+				if (!kevent_event_is_allowed(&ukev[i])) {
+					if (i != rnum)
+						memcpy(&ukev[rnum], 
+							&ukev[i], 
+							sizeof(struct ukevent));
+					rnum++;
+				} else {
+					err = kevent_user_add_ukevent(&ukev[i], u);
+					if (err) {
+						kevent_stat_im(u);
+						if (i != rnum)
+							memcpy(&ukev[rnum], 
+								&ukev[i], 
+								sizeof(struct ukevent));
+						rnum++;
+					}
+				}
+			}
+			if (copy_to_user(orig, ukev, rnum*sizeof(struct ukevent)))
+				cerr = -EFAULT;
+			kfree(ukev);
+			goto out_setup;
+		}
+	}
+
+	for (i = 0; i < num; ++i) {
+		if (copy_from_user(&uk, arg, sizeof(struct ukevent))) {
+			cerr = -EFAULT;
+			break;
+		}
+		arg += sizeof(struct ukevent);
+
+		if (!kevent_event_is_allowed(&uk)) {
+			err = 1;
+		} else {
+			err = kevent_user_add_ukevent(&uk, u);
+			if (err)
+				kevent_stat_im(u);
+		}
+		if (err) {
+			if (copy_to_user(orig, &uk, sizeof(struct ukevent))) {
+				cerr = -EFAULT;
+				break;
+			}
+			orig += sizeof(struct ukevent);
+			rnum++;
+		}
+	}
+
+out_setup:
+	if (cerr < 0) {
+		err = cerr;
+		goto out_remove;
+	}
+
+	err = rnum;
+out_remove:
+	mutex_unlock(&u->ctl_mutex);
+
+	return err;
+}
+
+/* Used to wakeup waiting syscalls in case high-resolution timer is used. */
+static int kevent_user_wake(struct hrtimer *timer)
+{
+	struct kevent_user *u = container_of(timer, struct kevent_user, timer);
+
+	u->need_exit = 1;
+	wake_up(&u->wait);
+
+	return HRTIMER_NORESTART;
+}
+
+
+/*
+ * In nonblocking mode it returns as many events as possible, but not more than @max_nr.
+ * In blocking mode it waits until timeout or if at least @min_nr events are ready.
+ */
+static int kevent_user_wait(struct file *file, struct kevent_user *u,
+		unsigned int min_nr, unsigned int max_nr, struct timespec timeout,
+		void __user *buf, unsigned int flags)
+{
+	struct kevent *k;
+	int num = 0;
+	long tm = MAX_SCHEDULE_TIMEOUT;
+
+	if (!(file->f_flags & O_NONBLOCK)) {
+		if (!timespec_valid(&timeout))
+			return -EINVAL;
+
+		if (flags & KEVENT_FLAGS_ABSTIME) {
+			hrtimer_cancel(&u->timer);
+			hrtimer_init(&u->timer, CLOCK_REALTIME, HRTIMER_ABS);
+			u->timer.expires = ktime_set(timeout.tv_sec, timeout.tv_nsec);
+			u->timer.function = &kevent_user_wake;
+			hrtimer_start(&u->timer, u->timer.expires, HRTIMER_ABS);
+			if (unlikely(kevent_debug_abstime == 0)) {
+				printk(KERN_INFO "kevent: author was wrong, "
+						"someone uses absolute time in %s, "
+						"please report to remove this warning.\n", __func__);
+				kevent_debug_abstime = 1;
+			}
+		} else {
+			tm = timespec_to_jiffies(&timeout);
+		}
+
+		wait_event_interruptible_timeout(u->wait,
+			((u->ready_num >= 1) && kevent_ring_space(u)) || u->need_exit, tm);
+	}
+	u->need_exit = 0;
+
+	while (num < max_nr && ((k = kevent_dequeue_ready(u)) != NULL)) {
+		if (copy_to_user(buf + num*sizeof(struct ukevent),
+					&k->event, sizeof(struct ukevent))) {
+			if (num == 0)
+				num = -EFAULT;
+			break;
+		}
+		kevent_complete_ready(k);
+		++num;
+		kevent_stat_wait(u);
+	}
+
+	return num;
+}
+
+struct file_operations kevent_user_fops = {
+	.release	= kevent_user_release,
+	.poll		= kevent_user_poll,
+	.owner		= THIS_MODULE,
+};
+
+static int kevent_ctl_process(struct file *file, unsigned int cmd, unsigned int num, void __user *arg)
+{
+	int err;
+	struct kevent_user *u = file->private_data;
+
+	switch (cmd) {
+	case KEVENT_CTL_ADD:
+		err = kevent_user_ctl_add(u, num, arg);
+		break;
+	case KEVENT_CTL_REMOVE:
+		err = kevent_user_ctl_remove(u, num, arg);
+		break;
+	case KEVENT_CTL_MODIFY:
+		err = kevent_user_ctl_modify(u, num, arg);
+		break;
+	case KEVENT_CTL_READY:
+		err = kevent_user_ctl_ready(u, num, arg);
+		break;
+	default:
+		err = -EINVAL;
+		break;
+	}
+
+	return err;
+}
+
+/*
+ * Used to get ready kevents from queue.
+ * @ctl_fd - kevent control descriptor which must be obtained through kevent_ctl(KEVENT_CTL_INIT).
+ * @min_nr - minimum number of ready kevents.
+ * @max_nr - maximum number of ready kevents.
+ * @timeout - time to wait until some events are ready.
+ * @buf - buffer to place ready events.
+ * @flags - various flags (see include/linux/ukevent.h KEVENT_FLAGS_*).
+ */
+asmlinkage long sys_kevent_get_events(int ctl_fd, unsigned int min_nr, unsigned int max_nr,
+		struct timespec timeout, struct ukevent __user *buf, unsigned flags)
+{
+	int err = -EINVAL;
+	struct file *file;
+	struct kevent_user *u;
+
+	file = fget(ctl_fd);
+	if (!file)
+		return -EBADF;
+
+	if (file->f_op != &kevent_user_fops)
+		goto out_fput;
+	u = file->private_data;
+
+	err = kevent_user_wait(file, u, min_nr, max_nr, timeout, buf, flags);
+out_fput:
+	fput(file);
+	return err;
+}
+
+static struct vfsmount *kevent_mnt __read_mostly;
+
+static int kevent_get_sb(struct file_system_type *fs_type, int flags,
+		   const char *dev_name, void *data, struct vfsmount *mnt)
+{
+	return get_sb_pseudo(fs_type, "kevent", NULL, 0xaabbccdd, mnt);
+}
+
+static struct file_system_type kevent_fs_type = {
+	.name		= "keventfs",
+	.get_sb		= kevent_get_sb,
+	.kill_sb	= kill_anon_super,
+};
+
+static int keventfs_delete_dentry(struct dentry *dentry)
+{
+	return 1;
+}
+
+static struct dentry_operations keventfs_dentry_operations = {
+	.d_delete	= keventfs_delete_dentry,
+};
+
+asmlinkage long sys_kevent_init(struct kevent_ring __user *ring, unsigned int num, unsigned int flags)
+{
+	struct qstr this;
+	char name[32];
+	struct dentry *dentry;
+	struct inode *inode;
+	struct file *file;
+	int err = -ENFILE, fd;
+	struct kevent_user *u;
+
+	if ((ring && !num) || (!ring && num) || (num == 1))
+		return -EINVAL;
+
+	file = get_empty_filp();
+	if (!file)
+		goto err_out_exit;
+
+	inode = new_inode(kevent_mnt->mnt_sb);
+	if (!inode)
+		goto err_out_fput;
+
+	inode->i_fop = &kevent_user_fops;
+
+	inode->i_state = I_DIRTY;
+	inode->i_mode = S_IRUSR | S_IWUSR;
+	inode->i_uid = current->fsuid;
+	inode->i_gid = current->fsgid;
+	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+
+	err = get_unused_fd();
+	if (err < 0)
+		goto err_out_iput;
+	fd = err;
+
+	err = -ENOMEM;
+	u = kevent_user_alloc(ring, num);
+	if (!u)
+		goto err_out_put_fd;
+
+	sprintf(name, "[%lu]", inode->i_ino);
+	this.name = name;
+	this.len = strlen(name);
+	this.hash = inode->i_ino;
+	dentry = d_alloc(kevent_mnt->mnt_sb->s_root, &this);
+	if (!dentry)
+		goto err_out_free;
+	dentry->d_op = &keventfs_dentry_operations;
+	d_add(dentry, inode);
+	file->f_vfsmnt = mntget(kevent_mnt);
+	file->f_dentry = dentry;
+	file->f_mapping = inode->i_mapping;
+	file->f_pos = 0;
+	file->f_flags = O_RDONLY;
+	file->f_op = &kevent_user_fops;
+	file->f_mode = FMODE_READ;
+	file->f_version = 0;
+	file->private_data = u;
+
+	fd_install(fd, file);
+
+	return fd;
+
+err_out_free:
+	kmem_cache_free(kevent_user_cache, u);
+err_out_put_fd:
+	put_unused_fd(fd);
+err_out_iput:
+	iput(inode);
+err_out_fput:
+	put_filp(file);
+err_out_exit:
+	return err;
+}
+
+/*
+ * Commits user's index (consumer index).
+ * Must be called under u->ring_lock mutex held.
+ */
+static int __kevent_user_commit(struct kevent_user *u, unsigned int new_uidx, unsigned int over)
+{
+	int err = -EOVERFLOW, comm = 0;
+	struct kevent_ring __user *ring = u->pring;
+
+	if (!ring) {
+		err = 0;
+		goto err_out_exit;
+	}
+
+	if (new_uidx >= u->ring_size) {
+		err = -EINVAL;
+		goto err_out_exit;
+	}
+
+	if ((over != u->ring_over - 1) && (over != u->ring_over))
+		goto err_out_exit;
+
+	if (u->uidx < u->kidx && new_uidx > u->kidx) {
+		err = -EINVAL;
+		goto err_out_exit;
+	}
+
+	if (new_uidx > u->uidx) {
+		if (over != u->ring_over)
+			goto err_out_exit;
+
+		comm = new_uidx - u->uidx;
+		u->uidx = new_uidx;
+		u->full = 0;
+	} else if (new_uidx < u->uidx) {
+		comm = u->ring_size - (u->uidx - new_uidx);
+		u->uidx = new_uidx;
+		u->full = 0;
+		u->ring_over++;
+
+		if (put_user(u->ring_over, &ring->ring_over)) {
+			err = -EFAULT;
+			goto err_out_exit;
+		}
+	}
+
+	return comm;
+
+err_out_exit:
+	return err;
+}
+
+/*
+ * This syscall is used to perform waiting until there is free space in the ring
+ * buffer, in that case some events will be copied there.
+ * Function returns number of actually copied ready events in ring buffer.
+ * After this function is completed userspace ring->ring_kidx will be updated.
+ *
+ * @ctl_fd - kevent file descriptor.
+ * @num - number of kevents to process.
+ * @old_uidx - the last index user is aware of.
+ * @timeout - time to wait until there is free space in kevent queue.
+ * @flags - various flags (see include/linux/ukevent.h KEVENT_FLAGS_*).
+ *
+ * When we need to commit @num events, it means we should just remove first @num
+ * kevents from ready queue and copy them into the buffer. 
+ * Kevents will be copied into ring buffer in order they were placed into ready queue.
+ * One-shot kevents will be removed here, since there is no way they can be reused.
+ * Edge-triggered events will be requeued here for better performance.
+ */
+asmlinkage long sys_kevent_wait(int ctl_fd, unsigned int num, unsigned int old_uidx, 
+		struct timespec timeout, unsigned int flags)
+{
+	int err = -EINVAL, copied = 0;
+	struct file *file;
+	struct kevent_user *u;
+	struct kevent *k;
+	struct kevent_ring __user *ring;
+	long tm = MAX_SCHEDULE_TIMEOUT;
+	unsigned int i;
+
+	file = fget(ctl_fd);
+	if (!file)
+		return -EBADF;
+
+	if (file->f_op != &kevent_user_fops)
+		goto out_fput;
+	u = file->private_data;
+
+	ring = u->pring;
+	if (!ring || num > u->ring_size)
+		goto out_fput;
+#if 0
+	/*
+	 * Allow to immediately update ring index, but it is not supported,
+	 * since syscall() has limited number of arguments which is actually
+	 * a good idea - use kevent_commit() instead.
+	 */
+	if ((u->uidx != new_uidx) && (new_uidx != 0xffffffff)) {
+		mutex_lock(&u->ring_lock);
+		__kevent_user_commit(u, new_uidx, over);
+		mutex_unlock(&u->ring_lock);
+	}
+#endif
+
+	if (!(file->f_flags & O_NONBLOCK)) {
+		if (!timespec_valid(&timeout))
+			goto out_fput;
+
+		if (flags & KEVENT_FLAGS_ABSTIME) {
+			hrtimer_cancel(&u->timer);
+			hrtimer_init(&u->timer, CLOCK_REALTIME, HRTIMER_ABS);
+			u->timer.expires = ktime_set(timeout.tv_sec, timeout.tv_nsec);
+			u->timer.function = &kevent_user_wake;
+			hrtimer_start(&u->timer, u->timer.expires, HRTIMER_ABS);
+			if (unlikely(kevent_debug_abstime == 0)) {
+				printk(KERN_INFO "kevent: author was wrong, "
+						"someone uses absolute time in %s, "
+						"please report to remove this warning.\n", __func__);
+				kevent_debug_abstime = 1;
+			}
+		} else {
+			tm = timespec_to_jiffies(&timeout);
+		}
+
+		wait_event_interruptible_timeout(u->wait,
+			((u->ready_num >= 1) && kevent_ring_space(u)) || 
+				u->need_exit || old_uidx != u->uidx,
+				tm);
+	}
+	u->need_exit = 0;
+
+	for (i=0; i<num; ++i) {
+		k = kevent_dequeue_ready_ring(u);
+		if (!k)
+			break;
+		kevent_complete_ready(k);
+
+		if (k->event.ret_flags & KEVENT_RET_COPY_FAILED)
+			break;
+		kevent_stat_ring(u);
+		copied++;
+	}
+
+	fput(file);
+
+	return copied;
+out_fput:
+	fput(file);
+	return err;
+}
+
+/*
+ * This syscall is used to commit events in ring buffer, i.e. mark appropriate
+ * entries as unused by userspace so subsequent kevent_wait() could overwrite them.
+ * This fucntion returns actual number of kevents which were committed.
+ * After this function is completed userspace ring->ring_over can be updated.
+ *
+ * @ctl_fd - kevent file descriptor.
+ * @new_uidx - the last committed kevent.
+ * @over - number of overflows given queue had.
+ */
+asmlinkage long sys_kevent_commit(int ctl_fd, unsigned int new_uidx, unsigned int over)
+{
+	int err = -EINVAL, comm = 0;
+	struct file *file;
+	struct kevent_user *u;
+
+	file = fget(ctl_fd);
+	if (!file)
+		return -EBADF;
+
+	if (file->f_op != &kevent_user_fops)
+		goto out_fput;
+	u = file->private_data;
+
+	mutex_lock(&u->ring_lock);
+	err = __kevent_user_commit(u, new_uidx, over);
+	if (err < 0)
+		goto err_out_unlock;
+	comm = err;
+	mutex_unlock(&u->ring_lock);
+
+	fput(file);
+
+	return comm;
+
+err_out_unlock:
+	mutex_unlock(&u->ring_lock);
+out_fput:
+	fput(file);
+	return err;
+}
+
+/*
+ * This syscall is used to perform various control operations
+ * on given kevent queue, which is obtained through kevent file descriptor @fd.
+ * @cmd - type of operation.
+ * @num - number of kevents to be processed.
+ * @arg - pointer to array of struct ukevent.
+ */
+asmlinkage long sys_kevent_ctl(int fd, unsigned int cmd, unsigned int num, struct ukevent __user *arg)
+{
+	int err = -EINVAL;
+	struct file *file;
+
+	file = fget(fd);
+	if (!file)
+		return -EBADF;
+
+	if (file->f_op != &kevent_user_fops)
+		goto out_fput;
+
+	err = kevent_ctl_process(file, cmd, num, arg);
+
+out_fput:
+	fput(file);
+	return err;
+}
+
+/*
+ * Kevent subsystem initialization - create caches and register
+ * filesystem to get control file descriptors from.
+ */
+static int __init kevent_user_init(void)
+{
+	int err = 0;
+
+	kevent_cache = kmem_cache_create("kevent_cache",
+			sizeof(struct kevent), 0, SLAB_PANIC, NULL, NULL);
+	
+	kevent_user_cache = kmem_cache_create("kevent_user_cache",
+			sizeof(struct kevent_user), 0, SLAB_PANIC, NULL, NULL);
+	
+	err = register_filesystem(&kevent_fs_type);
+	if (err)
+		goto err_out_exit;
+	
+	kevent_mnt = kern_mount(&kevent_fs_type);
+	err = PTR_ERR(kevent_mnt);
+	if (IS_ERR(kevent_mnt))
+		goto err_out_unreg;
+
+	printk(KERN_INFO "KEVENT subsystem has been successfully registered.\n");
+
+	return 0;
+
+err_out_unreg:
+	unregister_filesystem(&kevent_fs_type);
+err_out_exit:
+	kmem_cache_destroy(kevent_cache);
+	return err;
+}
+
+module_init(kevent_user_init);
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index d7306d0..a3fc075 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -123,6 +123,14 @@ cond_syscall(ppc_rtas);
 cond_syscall(sys_spu_run);
 cond_syscall(sys_spu_create);
 
+cond_syscall(sys_kevent_get_events);
+cond_syscall(sys_kevent_ctl);
+cond_syscall(sys_kevent_wait);
+cond_syscall(sys_kevent_commit);
+cond_syscall(sys_kevent_init);
+
+cond_syscall(sys_aio_sendfile);
+
 /* mmu depending weak syscall entries */
 cond_syscall(sys_mprotect);
 cond_syscall(sys_msync);


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [take31 4/10] kevent: Socket notifications.
  2007-01-08 19:25       ` [take31 3/10] kevent: poll/select() notifications Evgeniy Polyakov
@ 2007-01-08 19:25         ` Evgeniy Polyakov
  2007-01-08 19:25           ` [take31 5/10] kevent: Timer notifications Evgeniy Polyakov
  0 siblings, 1 reply; 76+ messages in thread
From: Evgeniy Polyakov @ 2007-01-08 19:25 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: David Miller, Ulrich Drepper, Andrew Morton, Evgeniy Polyakov,
	netdev, Zach Brown, Christoph Hellwig, Chase Venters,
	Johann Borck, linux-kernel, Jeff Garzik, Jamal Hadi Salim,
	Ingo Molnar


Socket notifications.

This patch includes socket send/recv/accept notifications.
Using trivial web server based on kevent and this features
instead of epoll it's performance increased more than noticebly.
More details about various benchmarks and server itself 
(evserver_kevent.c) can be found on project's homepage.

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mitp.ru>

diff --git a/fs/inode.c b/fs/inode.c
index bf21dc6..82817b1 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -21,6 +21,7 @@
 #include <linux/cdev.h>
 #include <linux/bootmem.h>
 #include <linux/inotify.h>
+#include <linux/kevent.h>
 #include <linux/mount.h>
 
 /*
@@ -164,12 +165,18 @@ static struct inode *alloc_inode(struct super_block *sb)
 		}
 		inode->i_private = NULL;
 		inode->i_mapping = mapping;
+#if defined CONFIG_KEVENT_SOCKET || defined CONFIG_KEVENT_PIPE
+		kevent_storage_init(inode, &inode->st);
+#endif
 	}
 	return inode;
 }
 
 void destroy_inode(struct inode *inode) 
 {
+#if defined CONFIG_KEVENT_SOCKET || defined CONFIG_KEVENT_PIPE
+	kevent_storage_fini(&inode->st);
+#endif
 	BUG_ON(inode_has_buffers(inode));
 	security_inode_free(inode);
 	if (inode->i_sb->s_op->destroy_inode)
diff --git a/include/net/sock.h b/include/net/sock.h
index 03684e7..d840399 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -49,6 +49,7 @@
 #include <linux/skbuff.h>	/* struct sk_buff */
 #include <linux/mm.h>
 #include <linux/security.h>
+#include <linux/kevent.h>
 
 #include <linux/filter.h>
 
@@ -451,6 +452,21 @@ static inline int sk_stream_memory_free(struct sock *sk)
 
 extern void sk_stream_rfree(struct sk_buff *skb);
 
+struct socket_alloc {
+	struct socket socket;
+	struct inode vfs_inode;
+};
+
+static inline struct socket *SOCKET_I(struct inode *inode)
+{
+	return &container_of(inode, struct socket_alloc, vfs_inode)->socket;
+}
+
+static inline struct inode *SOCK_INODE(struct socket *socket)
+{
+	return &container_of(socket, struct socket_alloc, socket)->vfs_inode;
+}
+
 static inline void sk_stream_set_owner_r(struct sk_buff *skb, struct sock *sk)
 {
 	skb->sk = sk;
@@ -478,6 +494,7 @@ static inline void sk_add_backlog(struct sock *sk, struct sk_buff *skb)
 		sk->sk_backlog.tail = skb;
 	}
 	skb->next = NULL;
+	kevent_socket_notify(sk, KEVENT_SOCKET_RECV);
 }
 
 #define sk_wait_event(__sk, __timeo, __condition)		\
@@ -679,21 +696,6 @@ static inline struct kiocb *siocb_to_kiocb(struct sock_iocb *si)
 	return si->kiocb;
 }
 
-struct socket_alloc {
-	struct socket socket;
-	struct inode vfs_inode;
-};
-
-static inline struct socket *SOCKET_I(struct inode *inode)
-{
-	return &container_of(inode, struct socket_alloc, vfs_inode)->socket;
-}
-
-static inline struct inode *SOCK_INODE(struct socket *socket)
-{
-	return &container_of(socket, struct socket_alloc, socket)->vfs_inode;
-}
-
 extern void __sk_stream_mem_reclaim(struct sock *sk);
 extern int sk_stream_mem_schedule(struct sock *sk, int size, int kind);
 
diff --git a/include/net/tcp.h b/include/net/tcp.h
index b7d8317..2763b30 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -864,6 +864,7 @@ static inline int tcp_prequeue(struct sock *sk, struct sk_buff *skb)
 			tp->ucopy.memory = 0;
 		} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
 			wake_up_interruptible(sk->sk_sleep);
+			kevent_socket_notify(sk, KEVENT_SOCKET_RECV|KEVENT_SOCKET_SEND);
 			if (!inet_csk_ack_scheduled(sk))
 				inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
 						          (3 * TCP_RTO_MIN) / 4,
diff --git a/kernel/kevent/kevent_socket.c b/kernel/kevent/kevent_socket.c
new file mode 100644
index 0000000..d1a2701
--- /dev/null
+++ b/kernel/kevent/kevent_socket.c
@@ -0,0 +1,144 @@
+/*
+ * 	kevent_socket.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/timer.h>
+#include <linux/file.h>
+#include <linux/tcp.h>
+#include <linux/kevent.h>
+
+#include <net/sock.h>
+#include <net/request_sock.h>
+#include <net/inet_connection_sock.h>
+
+static int kevent_socket_callback(struct kevent *k)
+{
+	struct inode *inode = k->st->origin;
+	unsigned int events = SOCKET_I(inode)->ops->poll(SOCKET_I(inode)->file, SOCKET_I(inode), NULL);
+
+	if ((events & (POLLIN | POLLRDNORM)) && (k->event.event & (KEVENT_SOCKET_RECV | KEVENT_SOCKET_ACCEPT)))
+		return 1;
+	if ((events & (POLLOUT | POLLWRNORM)) && (k->event.event & KEVENT_SOCKET_SEND))
+		return 1;
+	if (events & (POLLERR | POLLHUP | POLLRDHUP | POLLREMOVE))
+		return -1;
+	return 0;
+}
+
+int kevent_socket_enqueue(struct kevent *k)
+{
+	struct inode *inode;
+	struct socket *sock;
+	int err = -EBADF;
+
+	sock = sockfd_lookup(k->event.id.raw[0], &err);
+	if (!sock)
+		goto err_out_exit;
+
+	inode = igrab(SOCK_INODE(sock));
+	if (!inode)
+		goto err_out_fput;
+
+	err = kevent_storage_enqueue(&inode->st, k);
+	if (err)
+		goto err_out_iput;
+
+	if (k->event.req_flags & KEVENT_REQ_ALWAYS_QUEUE) {
+		kevent_requeue(k);
+		err = 0;
+	} else {
+		err = k->callbacks.callback(k);
+		if (err)
+			goto err_out_dequeue;
+	}
+
+	return err;
+
+err_out_dequeue:
+	kevent_storage_dequeue(k->st, k);
+err_out_iput:
+	iput(inode);
+err_out_fput:
+	sockfd_put(sock);
+err_out_exit:
+	return err;
+}
+
+int kevent_socket_dequeue(struct kevent *k)
+{
+	struct inode *inode = k->st->origin;
+	struct socket *sock;
+
+	kevent_storage_dequeue(k->st, k);
+
+	sock = SOCKET_I(inode);
+	iput(inode);
+	sockfd_put(sock);
+
+	return 0;
+}
+
+void kevent_socket_notify(struct sock *sk, u32 event)
+{
+	if (sk->sk_socket)
+		kevent_storage_ready(&SOCK_INODE(sk->sk_socket)->st, NULL, event);
+}
+
+/*
+ * It is required for network protocols compiled as modules, like IPv6.
+ */
+EXPORT_SYMBOL_GPL(kevent_socket_notify);
+
+#ifdef CONFIG_LOCKDEP
+static struct lock_class_key kevent_sock_key;
+
+void kevent_socket_reinit(struct socket *sock)
+{
+	struct inode *inode = SOCK_INODE(sock);
+
+	lockdep_set_class(&inode->st.lock, &kevent_sock_key);
+}
+
+void kevent_sk_reinit(struct sock *sk)
+{
+	if (sk->sk_socket) {
+		struct inode *inode = SOCK_INODE(sk->sk_socket);
+
+		lockdep_set_class(&inode->st.lock, &kevent_sock_key);
+	}
+}
+#endif
+static int __init kevent_init_socket(void)
+{
+	struct kevent_callbacks sc = {
+		.callback = &kevent_socket_callback,
+		.enqueue = &kevent_socket_enqueue,
+		.dequeue = &kevent_socket_dequeue,
+		.flags = 0,
+	};
+
+	return kevent_add_callbacks(&sc, KEVENT_SOCKET);
+}
+module_init(kevent_init_socket);
diff --git a/net/core/sock.c b/net/core/sock.c
index 0ed5b4f..e687f54 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1393,6 +1393,7 @@ static void sock_def_wakeup(struct sock *sk)
 	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
 		wake_up_interruptible_all(sk->sk_sleep);
 	read_unlock(&sk->sk_callback_lock);
+	kevent_socket_notify(sk, KEVENT_SOCKET_RECV|KEVENT_SOCKET_SEND);
 }
 
 static void sock_def_error_report(struct sock *sk)
@@ -1402,6 +1403,7 @@ static void sock_def_error_report(struct sock *sk)
 		wake_up_interruptible(sk->sk_sleep);
 	sk_wake_async(sk,0,POLL_ERR); 
 	read_unlock(&sk->sk_callback_lock);
+	kevent_socket_notify(sk, KEVENT_SOCKET_RECV|KEVENT_SOCKET_SEND);
 }
 
 static void sock_def_readable(struct sock *sk, int len)
@@ -1411,6 +1413,7 @@ static void sock_def_readable(struct sock *sk, int len)
 		wake_up_interruptible(sk->sk_sleep);
 	sk_wake_async(sk,1,POLL_IN);
 	read_unlock(&sk->sk_callback_lock);
+	kevent_socket_notify(sk, KEVENT_SOCKET_RECV|KEVENT_SOCKET_SEND);
 }
 
 static void sock_def_write_space(struct sock *sk)
@@ -1430,6 +1433,7 @@ static void sock_def_write_space(struct sock *sk)
 	}
 
 	read_unlock(&sk->sk_callback_lock);
+	kevent_socket_notify(sk, KEVENT_SOCKET_SEND|KEVENT_SOCKET_RECV);
 }
 
 static void sock_def_destruct(struct sock *sk)
@@ -1480,6 +1484,8 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 	sk->sk_state		=	TCP_CLOSE;
 	sk->sk_socket		=	sock;
 
+	kevent_sk_reinit(sk);
+
 	sock_set_flag(sk, SOCK_ZAPPED);
 
 	if(sock)
@@ -1546,8 +1552,10 @@ void fastcall release_sock(struct sock *sk)
 	if (sk->sk_backlog.tail)
 		__release_sock(sk);
 	sk->sk_lock.owner = NULL;
-	if (waitqueue_active(&sk->sk_lock.wq))
+	if (waitqueue_active(&sk->sk_lock.wq)) {
 		wake_up(&sk->sk_lock.wq);
+		kevent_socket_notify(sk, KEVENT_SOCKET_RECV|KEVENT_SOCKET_SEND);
+	}
 	spin_unlock_bh(&sk->sk_lock.slock);
 }
 EXPORT_SYMBOL(release_sock);
diff --git a/net/core/stream.c b/net/core/stream.c
index d1d7dec..2878c2a 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -36,6 +36,7 @@ void sk_stream_write_space(struct sock *sk)
 			wake_up_interruptible(sk->sk_sleep);
 		if (sock->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN))
 			sock_wake_async(sock, 2, POLL_OUT);
+		kevent_socket_notify(sk, KEVENT_SOCKET_SEND|KEVENT_SOCKET_RECV);
 	}
 }
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index c701f6a..84ce4c5 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3127,6 +3127,7 @@ static void tcp_ofo_queue(struct sock *sk)
 
 		__skb_unlink(skb, &tp->out_of_order_queue);
 		__skb_queue_tail(&sk->sk_receive_queue, skb);
+		kevent_socket_notify(sk, KEVENT_SOCKET_RECV);
 		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
 		if(skb->h.th->fin)
 			tcp_fin(skb, sk, skb->h.th);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index bf7a224..bca9f2a 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -61,6 +61,7 @@
 #include <linux/jhash.h>
 #include <linux/init.h>
 #include <linux/times.h>
+#include <linux/kevent.h>
 
 #include <net/icmp.h>
 #include <net/inet_hashtables.h>
@@ -1391,6 +1392,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	   	reqsk_free(req);
 	} else {
 		inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
+		kevent_socket_notify(sk, KEVENT_SOCKET_ACCEPT);
 	}
 	return 0;
 
diff --git a/net/socket.c b/net/socket.c
index 4e39631..776dc2e 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -84,6 +84,7 @@
 #include <linux/kmod.h>
 #include <linux/audit.h>
 #include <linux/wireless.h>
+#include <linux/kevent.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -496,6 +497,8 @@ static struct socket *sock_alloc(void)
 	inode->i_uid = current->fsuid;
 	inode->i_gid = current->fsgid;
 
+	kevent_socket_reinit(sock);
+
 	get_cpu_var(sockets_in_use)++;
 	put_cpu_var(sockets_in_use);
 	return sock;
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 2f208c7..835e20f 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1563,8 +1563,10 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
 	struct scm_cookie tmp_scm;
 	struct sock *sk = sock->sk;
 	struct unix_sock *u = unix_sk(sk);
+	struct sock *other;
 	int noblock = flags & MSG_DONTWAIT;
 	struct sk_buff *skb;
+
 	int err;
 
 	err = -EOPNOTSUPP;
@@ -1580,6 +1582,12 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
 		goto out_unlock;
 
 	wake_up_interruptible(&u->peer_wait);
+	other =unix_peer_get(sk);
+	if (other) {
+		kevent_socket_notify(other, KEVENT_SOCKET_SEND);
+		sock_put(other);
+	} else
+		kevent_socket_notify(sk, KEVENT_SOCKET_RECV);
 
 	if (msg->msg_name)
 		unix_copy_addr(msg, skb->sk);
@@ -1674,7 +1682,7 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
 {
 	struct sock_iocb *siocb = kiocb_to_siocb(iocb);
 	struct scm_cookie tmp_scm;
-	struct sock *sk = sock->sk;
+	struct sock *sk = sock->sk, *other;
 	struct unix_sock *u = unix_sk(sk);
 	struct sockaddr_un *sunaddr=msg->msg_name;
 	int copied = 0;
@@ -1803,6 +1811,14 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
 		}
 	} while (size);
 
+	other =unix_peer_get(sk);
+	if (other) {
+		kevent_socket_notify(other, KEVENT_SOCKET_SEND);
+		sock_put(other);
+	}
+	if (sk->sk_shutdown & RCV_SHUTDOWN || !other)
+		kevent_socket_notify(sk, KEVENT_SOCKET_RECV);
+
 	mutex_unlock(&u->readlock);
 	scm_recv(sock, msg, siocb->scm, flags);
 out:
@@ -1824,6 +1840,10 @@ static int unix_shutdown(struct socket *sock, int mode)
 			sock_hold(other);
 		unix_state_wunlock(sk);
 		sk->sk_state_change(sk);
+		kevent_socket_notify(sk, KEVENT_SOCKET_SEND|KEVENT_SOCKET_RECV);
+		if (other)
+			kevent_socket_notify(other, KEVENT_SOCKET_SEND|KEVENT_SOCKET_RECV);
+		kevent_socket_notify(sk, KEVENT_SOCKET_SEND|KEVENT_SOCKET_RECV);
 
 		if (other &&
 			(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [take31 0/10] kevent: Generic event handling mechanism.
       [not found] <3154985aa0591036@2ka.mipt.ru>
                   ` (3 preceding siblings ...)
  2006-12-29 12:25 ` [take30 0/9] " Evgeniy Polyakov
@ 2007-01-08 19:25 ` Evgeniy Polyakov
  2007-01-08 19:25   ` [take31 1/10] kevent: Description Evgeniy Polyakov
  4 siblings, 1 reply; 76+ messages in thread
From: Evgeniy Polyakov @ 2007-01-08 19:25 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: David Miller, Ulrich Drepper, Andrew Morton, Evgeniy Polyakov,
	netdev, Zach Brown, Christoph Hellwig, Chase Venters,
	Johann Borck, linux-kernel, Jeff Garzik, Jamal Hadi Salim,
	Ingo Molnar, linux-fsdevel


Generic event handling mechanism.

Kevent is a generic subsytem which allows to handle event notifications.
It supports both level and edge triggered events. It is similar to
poll/epoll in some cases, but it is more scalable, it is faster and
allows to work with essentially eny kind of events.

Events are provided into kernel through control syscall and can be read
back through ring buffer or using usual syscalls.
Kevent update (i.e. readiness switching) happens directly from internals
of the appropriate state machine of the underlying subsytem (like
network, filesystem, timer or any other).

Homepage:
http://tservice.net.ru/~s0mbre/old/?section=projects&item=kevent

Documentation page:
http://linux-net.osdl.org/index.php/Kevent

Consider for inclusion.

P.S. If you want to be removed from Cc: list just drop me a mail.

Changes from 'take30' patchset:
 * AIO state machine.
 * aio_sendfile() implementation.
 * moved kevent_user_get/kevent_user_put into header.
 * use *zalloc where needed.

Changes from 'take29' patchset:
 * new private userspace notifications - allows to queue any userspace private 
 	event and then mark it as ready using kevent_ctl(KEVENT_READY) command
 * KEVENT_REQ_READY flag - if set kevent will be marked as ready at enqueue time
 * port to 2.6.20-rc2 tree (54abb5fcdae74a811ed440ec6556cabc6b24f404 commit)
 * use struct kmem_cache instead of kmem_cache_t
 * added notificaion type into search key, this allows to have the same id for 
 	different types of notifications

Changes from 'take28' patchset:
 * optimized af_unix to use socket notifications
 * changed ALWAYS_QUEUE behaviour with poll/select notifications - previously
 	kevent was not queued into poll wait queue when ALWAYS_QUEUE flag
	is set
 * added KEVENT_POLL_POLLRDHUP definition into ukevent.h header
 * libevent-1.2 patch (Jamal, your request is completed, so I'm waiting two weeks
 	before starting final countdown :)
	All regression tests passed successfully except test_evbuffer(), which is
	crashed on my amd64 linux 2.6 test machine for all types of notifications,
	probably it was fixed in libevent-1.2a version, I did not check.
	Patch and README can be found at project homepage.

Changes from 'take27' patchset:
 * made kevent default yes in non embedded case.
 * added falgs to callback structures - currently used to check if kevent
 	can be requested from kernelspace only (posix timers) or 
	userspace (all others)

Changes from 'take26' patchset:
 * made kevent visible in config only in case of embedded setup.
 * added comment about KEVENT_MAX number.
 * spell fix.

Changes from 'take25' patchset:
 * use timespec as timeout parameter.
 * added high-resolution timer to handle absolute timeouts.
 * added flags to waiting and initialization syscalls.
 * kevent_commit() has new_uidx parameter.
 * kevent_wait() has old_uidx parameter, which, if not equal to u->uidx,
 	results in immediate wakeup (usefull for the case when entries
	are added asynchronously from kernel (not supported for now)).
 * added interface to mark any event as ready.
 * event POSIX timers support.
 * return -ENOSYS if there is no registered event type.
 * provided file descriptor must be checked for fifo type (spotted by Eric Dumazet).
 * signal notifications.
 * documentation update.
 * lighttpd patch updated (the latest benchmarks with lighttpd patch can be found in blog).

Changes from 'take24' patchset:
 * new (old (new)) ring buffer implementation with kernel and user indexes.
 * added initialization syscall instead of opening /dev/kevent
 * kevent_commit() syscall to commit ring buffer entries
 * changed KEVENT_REQ_WAKEUP_ONE flag to KEVENT_REQ_WAKEUP_ALL, kevent wakes
   only first thread always if that flag is not set
 * KEVENT_REQ_ALWAYS_QUEUE flag. If set, kevent will be queued into ready queue
   instead of copying back to userspace when kevent is ready immediately when
   it is added.
 * lighttpd patch (Hail! Although nothing really outstanding compared to epoll)

Changes from 'take23' patchset:
 * kevent PIPE notifications
 * KEVENT_REQ_LAST_CHECK flag, which allows to perform last check at dequeueing time
 * fixed poll/select notifications (were broken due to tree manipulations)
 * made Documentation/kevent.txt look nice in 80-col terminal
 * fix for copy_to_user() failure report for the first kevent (Andrew Morton)
 * minor function renames

Changes from 'take22' patchset:
 * new ring buffer implementation in process' memory
 * wakeup-one-thread flag
 * edge-triggered behaviour

Changes from 'take21' patchset:
 * minor cleanups (different return values, removed unneded variables, whitespaces and so on)
 * fixed bug in kevent removal in case when kevent being removed
   is the same as overflow_kevent (spotted by Eric Dumazet)

Changes from 'take20' patchset:
 * new ring buffer implementation
 * removed artificial limit on possible number of kevents

Changes from 'take19' patchset:
 * use __init instead of __devinit
 * removed 'default N' from config for user statistic
 * removed kevent_user_fini() since kevent can not be unloaded
 * use KERN_INFO for statistic output

Changes from 'take18' patchset:
 * use __init instead of __devinit
 * removed 'default N' from config for user statistic
 * removed kevent_user_fini() since kevent can not be unloaded
 * use KERN_INFO for statistic output

Changes from 'take17' patchset:
 * Use RB tree instead of hash table. 
	At least for a web sever, frequency of addition/deletion of new kevent 
	is comparable with number of search access, i.e. most of the time events 
	are added, accesed only couple of times and then removed, so it justifies 
	RB tree usage over AVL tree, since the latter does have much slower deletion 
	time (max O(log(N)) compared to 3 ops), 
	although faster search time (1.44*O(log(N)) vs. 2*O(log(N))). 
	So for kevents I use RB tree for now and later, when my AVL tree implementation 
	is ready, it will be possible to compare them.
 * Changed readiness check for socket notifications.

With both above changes it is possible to achieve more than 3380 req/second compared to 2200, 
sometimes 2500 req/second for epoll() for trivial web-server and httperf client on the same
hardware.
It is possible that above kevent limit is due to maximum allowed kevents in a time limit, which is
4096 events.

Changes from 'take16' patchset:
 * misc cleanups (__read_mostly, const ...)
 * created special macro which is used for mmap size (number of pages) calculation
 * export kevent_socket_notify(), since it is used in network protocols which can be 
	built as modules (IPv6 for example)

Changes from 'take15' patchset:
 * converted kevent_timer to high-resolution timers, this forces timer API update at
	http://linux-net.osdl.org/index.php/Kevent
 * use struct ukevent* instead of void * in syscalls (documentation has been updated)
 * added warning in kevent_add_ukevent() if ring has broken index (for testing)

Changes from 'take14' patchset:
 * added kevent_wait()
    This syscall waits until either timeout expires or at least one event
    becomes ready. It also commits that @num events from @start are processed
    by userspace and thus can be be removed or rearmed (depending on it's flags).
    It can be used for commit events read by userspace through mmap interface.
    Example userspace code (evtest.c) can be found on project's homepage.
 * added socket notifications (send/recv/accept)

Changes from 'take13' patchset:
 * do not get lock aroung user data check in __kevent_search()
 * fail early if there were no registered callbacks for given type of kevent
 * trailing whitespace cleanup

Changes from 'take12' patchset:
 * remove non-chardev interface for initialization
 * use pointer to kevent_mring instead of unsigned longs
 * use aligned 64bit type in raw user data (can be used by high-res timer if needed)
 * simplified enqueue/dequeue callbacks and kevent initialization
 * use nanoseconds for timeout
 * put number of milliseconds into timer's return data
 * move some definitions into user-visible header
 * removed filenames from comments

Changes from 'take11' patchset:
 * include missing headers into patchset
 * some trivial code cleanups (use goto instead of if/else games and so on)
 * some whitespace cleanups
 * check for ready_callback() callback before main loop which should save us some ticks

Changes from 'take10' patchset:
 * removed non-existent prototypes
 * added helper function for kevent_registered_callbacks
 * fixed 80 lines comments issues
 * added shared between userspace and kernelspace header instead of embedd them in one
 * core restructuring to remove forward declarations
 * s o m e w h i t e s p a c e c o d y n g s t y l e c l e a n u p
 * use vm_insert_page() instead of remap_pfn_range()

Changes from 'take9' patchset:
 * fixed ->nopage method

Changes from 'take8' patchset:
 * fixed mmap release bug
 * use module_init() instead of late_initcall()
 * use better structures for timer notifications

Changes from 'take7' patchset:
 * new mmap interface (not tested, waiting for other changes to be acked)
	- use nopage() method to dynamically substitue pages
	- allocate new page for events only when new added kevent requres it
	- do not use ugly index dereferencing, use structure instead
	- reduced amount of data in the ring (id and flags), 
		maximum 12 pages on x86 per kevent fd

Changes from 'take6' patchset:
 * a lot of comments!
 * do not use list poisoning for detection of the fact, that entry is in the list
 * return number of ready kevents even if copy*user() fails
 * strict check for number of kevents in syscall
 * use ARRAY_SIZE for array size calculation
 * changed superblock magic number
 * use SLAB_PANIC instead of direct panic() call
 * changed -E* return values
 * a lot of small cleanups and indent fixes

Changes from 'take5' patchset:
 * removed compilation warnings about unused wariables when lockdep is not turned on
 * do not use internal socket structures, use appropriate (exported) wrappers instead
 * removed default 1 second timeout
 * removed AIO stuff from patchset

Changes from 'take4' patchset:
 * use miscdevice instead of chardevice
 * comments fixes

Changes from 'take3' patchset:
 * removed serializing mutex from kevent_user_wait()
 * moved storage list processing to RCU
 * removed lockdep screaming - all storage locks are initialized in the same function, so it was
learned 
	to differentiate between various cases
 * remove kevent from storage if is marked as broken after callback
 * fixed a typo in mmaped buffer implementation which would end up in wrong index calcualtion 

Changes from 'take2' patchset:
 * split kevent_finish_user() to locked and unlocked variants
 * do not use KEVENT_STAT ifdefs, use inline functions instead
 * use array of callbacks of each type instead of each kevent callback initialization
 * changed name of ukevent guarding lock
 * use only one kevent lock in kevent_user for all hash buckets instead of per-bucket locks
 * do not use kevent_user_ctl structure instead provide needed arguments as syscall parameters
 * various indent cleanups
 * added optimisation, which is aimed to help when a lot of kevents are being copied from
userspace
 * mapped buffer (initial) implementation (no userspace yet)

Changes from 'take1' patchset:
 - rebased against 2.6.18-git tree
 - removed ioctl controlling
 - added new syscall kevent_get_events(int fd, unsigned int min_nr, unsigned int max_nr,
			unsigned int timeout, void __user *buf, unsigned flags)
 - use old syscall kevent_ctl for creation/removing, modification and initial kevent 
	initialization
 - use mutuxes instead of semaphores
 - added file descriptor check and return error if provided descriptor does not match
	kevent file operations
 - various indent fixes
 - removed aio_sendfile() declarations.

Thank you.

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>



^ permalink raw reply	[flat|nested] 76+ messages in thread

* [take31 3/10] kevent: poll/select() notifications.
  2007-01-08 19:25     ` [take31 2/10] kevent: Core files Evgeniy Polyakov
@ 2007-01-08 19:25       ` Evgeniy Polyakov
  2007-01-08 19:25         ` [take31 4/10] kevent: Socket notifications Evgeniy Polyakov
  0 siblings, 1 reply; 76+ messages in thread
From: Evgeniy Polyakov @ 2007-01-08 19:25 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: David Miller, Ulrich Drepper, Andrew Morton, Evgeniy Polyakov,
	netdev, Zach Brown, Christoph Hellwig, Chase Venters,
	Johann Borck, linux-kernel, Jeff Garzik, Jamal Hadi Salim,
	Ingo Molnar


poll/select() notifications.

This patch includes generic poll/select notifications.
kevent_poll works simialr to epoll and has the same issues (callback
is invoked not from internal state machine of the caller, but through
process awake, a lot of allocations and so on).

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mitp.ru>

diff --git a/fs/file_table.c b/fs/file_table.c
index 4c17a18..46f458c 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -20,6 +20,7 @@
 #include <linux/cdev.h>
 #include <linux/fsnotify.h>
 #include <linux/sysctl.h>
+#include <linux/kevent.h>
 #include <linux/percpu_counter.h>
 
 #include <asm/atomic.h>
@@ -119,6 +120,7 @@ struct file *get_empty_filp(void)
 	f->f_uid = tsk->fsuid;
 	f->f_gid = tsk->fsgid;
 	eventpoll_init_file(f);
+	kevent_init_file(f);
 	/* f->f_version: 0 */
 	return f;
 
@@ -164,6 +166,7 @@ void fastcall __fput(struct file *file)
 	 * in the file cleanup chain.
 	 */
 	eventpoll_release(file);
+	kevent_cleanup_file(file);
 	locks_remove_flock(file);
 
 	if (file->f_op && file->f_op->release)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 186da81..59e6069 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -280,6 +280,7 @@ extern int dir_notify_enable;
 #include <linux/init.h>
 #include <linux/pid.h>
 #include <linux/mutex.h>
+#include <linux/kevent_storage.h>
 
 #include <asm/atomic.h>
 #include <asm/semaphore.h>
@@ -408,6 +409,8 @@ struct address_space_operations {
 
 	int (*readpages)(struct file *filp, struct address_space *mapping,
 			struct list_head *pages, unsigned nr_pages);
+	int (*aio_readpages)(struct file *filp, struct address_space *mapping,
+			struct list_head *pages, unsigned nr_pages, void *priv);
 
 	/*
 	 * ext3 requires that a successful prepare_write() call be followed
@@ -578,6 +581,10 @@ struct inode {
 	struct mutex		inotify_mutex;	/* protects the watches list */
 #endif
 
+#if defined CONFIG_KEVENT_SOCKET || defined CONFIG_KEVENT_PIPE
+	struct kevent_storage	st;
+#endif
+
 	unsigned long		i_state;
 	unsigned long		dirtied_when;	/* jiffies of first dirtying */
 
@@ -737,6 +744,9 @@ struct file {
 	struct list_head	f_ep_links;
 	spinlock_t		f_ep_lock;
 #endif /* #ifdef CONFIG_EPOLL */
+#ifdef CONFIG_KEVENT_POLL
+	struct kevent_storage	st;
+#endif
 	struct address_space	*f_mapping;
 };
 extern spinlock_t files_lock;
diff --git a/kernel/kevent/kevent_poll.c b/kernel/kevent/kevent_poll.c
new file mode 100644
index 0000000..58129fa
--- /dev/null
+++ b/kernel/kevent/kevent_poll.c
@@ -0,0 +1,234 @@
+/*
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/timer.h>
+#include <linux/file.h>
+#include <linux/kevent.h>
+#include <linux/poll.h>
+#include <linux/fs.h>
+
+static struct kmem_cache *kevent_poll_container_cache;
+static struct kmem_cache *kevent_poll_priv_cache;
+
+struct kevent_poll_ctl
+{
+	struct poll_table_struct 	pt;
+	struct kevent			*k;
+};
+
+struct kevent_poll_wait_container
+{
+	struct list_head		container_entry;
+	wait_queue_head_t		*whead;
+	wait_queue_t			wait;
+	struct kevent			*k;
+};
+
+struct kevent_poll_private
+{
+	struct list_head		container_list;
+	spinlock_t			container_lock;
+};
+
+static int kevent_poll_enqueue(struct kevent *k);
+static int kevent_poll_dequeue(struct kevent *k);
+static int kevent_poll_callback(struct kevent *k);
+
+static int kevent_poll_wait_callback(wait_queue_t *wait,
+		unsigned mode, int sync, void *key)
+{
+	struct kevent_poll_wait_container *cont =
+		container_of(wait, struct kevent_poll_wait_container, wait);
+	struct kevent *k = cont->k;
+
+	kevent_storage_ready(k->st, NULL, KEVENT_MASK_ALL);
+	return 0;
+}
+
+static void kevent_poll_qproc(struct file *file, wait_queue_head_t *whead,
+		struct poll_table_struct *poll_table)
+{
+	struct kevent *k =
+		container_of(poll_table, struct kevent_poll_ctl, pt)->k;
+	struct kevent_poll_private *priv = k->priv;
+	struct kevent_poll_wait_container *cont;
+	unsigned long flags;
+
+	cont = kmem_cache_alloc(kevent_poll_container_cache, GFP_KERNEL);
+	if (!cont) {
+		kevent_break(k);
+		return;
+	}
+
+	cont->k = k;
+	init_waitqueue_func_entry(&cont->wait, kevent_poll_wait_callback);
+	cont->whead = whead;
+
+	spin_lock_irqsave(&priv->container_lock, flags);
+	list_add_tail(&cont->container_entry, &priv->container_list);
+	spin_unlock_irqrestore(&priv->container_lock, flags);
+
+	add_wait_queue(whead, &cont->wait);
+}
+
+static int kevent_poll_enqueue(struct kevent *k)
+{
+	struct file *file;
+	int err;
+	unsigned int revents;
+	unsigned long flags;
+	struct kevent_poll_ctl ctl;
+	struct kevent_poll_private *priv;
+
+	file = fget(k->event.id.raw[0]);
+	if (!file)
+		return -EBADF;
+	
+	err = -EINVAL;
+	if (!file->f_op || !file->f_op->poll)
+		goto err_out_fput;
+	
+	err = -ENOMEM;
+	priv = kmem_cache_alloc(kevent_poll_priv_cache, GFP_KERNEL);
+	if (!priv)
+		goto err_out_fput;
+
+	spin_lock_init(&priv->container_lock);
+	INIT_LIST_HEAD(&priv->container_list);
+
+	k->priv = priv;
+
+	ctl.k = k;
+	init_poll_funcptr(&ctl.pt, &kevent_poll_qproc);
+	
+	err = kevent_storage_enqueue(&file->st, k);
+	if (err)
+		goto err_out_free;
+
+	revents = file->f_op->poll(file, &ctl.pt);
+	if (k->event.req_flags & KEVENT_REQ_ALWAYS_QUEUE) {
+		kevent_requeue(k);
+	} else {
+		if (revents & k->event.event) {
+			err = 1;
+			goto out_dequeue;
+		}
+	}
+
+	spin_lock_irqsave(&k->ulock, flags);
+	k->event.req_flags |= KEVENT_REQ_LAST_CHECK;
+	spin_unlock_irqrestore(&k->ulock, flags);
+
+	return 0;
+
+out_dequeue:
+	kevent_storage_dequeue(k->st, k);
+err_out_free:
+	kmem_cache_free(kevent_poll_priv_cache, priv);
+err_out_fput:
+	fput(file);
+	return err;
+}
+
+static int kevent_poll_dequeue(struct kevent *k)
+{
+	struct file *file = k->st->origin;
+	struct kevent_poll_private *priv = k->priv;
+	struct kevent_poll_wait_container *w, *n;
+	unsigned long flags;
+
+	kevent_storage_dequeue(k->st, k);
+
+	spin_lock_irqsave(&priv->container_lock, flags);
+	list_for_each_entry_safe(w, n, &priv->container_list, container_entry) {
+		list_del(&w->container_entry);
+		remove_wait_queue(w->whead, &w->wait);
+		kmem_cache_free(kevent_poll_container_cache, w);
+	}
+	spin_unlock_irqrestore(&priv->container_lock, flags);
+
+	kmem_cache_free(kevent_poll_priv_cache, priv);
+	k->priv = NULL;
+
+	fput(file);
+
+	return 0;
+}
+
+static int kevent_poll_callback(struct kevent *k)
+{
+	if (k->event.req_flags & KEVENT_REQ_LAST_CHECK) {
+		return 1;
+	} else {
+		struct file *file = k->st->origin;
+		unsigned int revents = file->f_op->poll(file, NULL);
+
+		k->event.ret_data[0] = revents & k->event.event;
+
+		return (revents & k->event.event);
+	}
+}
+
+static int __init kevent_poll_sys_init(void)
+{
+	struct kevent_callbacks pc = {
+		.callback = &kevent_poll_callback,
+		.enqueue = &kevent_poll_enqueue,
+		.dequeue = &kevent_poll_dequeue,
+		.flags = 0,
+	};
+
+	kevent_poll_container_cache = kmem_cache_create("kevent_poll_container_cache",
+			sizeof(struct kevent_poll_wait_container), 0, 0, NULL, NULL);
+	if (!kevent_poll_container_cache) {
+		printk(KERN_ERR "Failed to create kevent poll container cache.\n");
+		return -ENOMEM;
+	}
+
+	kevent_poll_priv_cache = kmem_cache_create("kevent_poll_priv_cache",
+			sizeof(struct kevent_poll_private), 0, 0, NULL, NULL);
+	if (!kevent_poll_priv_cache) {
+		printk(KERN_ERR "Failed to create kevent poll private data cache.\n");
+		kmem_cache_destroy(kevent_poll_container_cache);
+		kevent_poll_container_cache = NULL;
+		return -ENOMEM;
+	}
+
+	kevent_add_callbacks(&pc, KEVENT_POLL);
+
+	printk(KERN_INFO "Kevent poll()/select() subsystem has been initialized.\n");
+	return 0;
+}
+
+static struct lock_class_key kevent_poll_key;
+
+void kevent_poll_reinit(struct file *file)
+{
+	lockdep_set_class(&file->st.lock, &kevent_poll_key);
+}
+
+static void __exit kevent_poll_sys_fini(void)
+{
+	kmem_cache_destroy(kevent_poll_priv_cache);
+	kmem_cache_destroy(kevent_poll_container_cache);
+}
+
+module_init(kevent_poll_sys_init);
+module_exit(kevent_poll_sys_fini);


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [take31 5/10] kevent: Timer notifications.
  2007-01-08 19:25         ` [take31 4/10] kevent: Socket notifications Evgeniy Polyakov
@ 2007-01-08 19:25           ` Evgeniy Polyakov
  2007-01-08 19:25             ` [take31 6/10] kevent: Pipe notifications Evgeniy Polyakov
  0 siblings, 1 reply; 76+ messages in thread
From: Evgeniy Polyakov @ 2007-01-08 19:25 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: David Miller, Ulrich Drepper, Andrew Morton, Evgeniy Polyakov,
	netdev, Zach Brown, Christoph Hellwig, Chase Venters,
	Johann Borck, linux-kernel, Jeff Garzik, Jamal Hadi Salim,
	Ingo Molnar


Timer notifications.

Timer notifications can be used for fine grained per-process time 
management, since interval timers are very inconvenient to use, 
and they are limited.

This subsystem uses high-resolution timers.
id.raw[0] is used as number of seconds
id.raw[1] is used as number of nanoseconds

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>

diff --git a/kernel/kevent/kevent_timer.c b/kernel/kevent/kevent_timer.c
new file mode 100644
index 0000000..c21a155
--- /dev/null
+++ b/kernel/kevent/kevent_timer.c
@@ -0,0 +1,114 @@
+/*
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/hrtimer.h>
+#include <linux/jiffies.h>
+#include <linux/kevent.h>
+
+struct kevent_timer
+{
+	struct hrtimer		ktimer;
+	struct kevent_storage	ktimer_storage;
+	struct kevent		*ktimer_event;
+};
+
+static int kevent_timer_func(struct hrtimer *timer)
+{
+	struct kevent_timer *t = container_of(timer, struct kevent_timer, ktimer);
+	struct kevent *k = t->ktimer_event;
+
+	kevent_storage_ready(&t->ktimer_storage, NULL, KEVENT_MASK_ALL);
+	hrtimer_forward(timer, timer->base->softirq_time,
+			ktime_set(k->event.id.raw[0], k->event.id.raw[1]));
+	return HRTIMER_RESTART;
+}
+
+static struct lock_class_key kevent_timer_key;
+
+static int kevent_timer_enqueue(struct kevent *k)
+{
+	int err;
+	struct kevent_timer *t;
+
+	t = kmalloc(sizeof(struct kevent_timer), GFP_KERNEL);
+	if (!t)
+		return -ENOMEM;
+
+	hrtimer_init(&t->ktimer, CLOCK_MONOTONIC, HRTIMER_REL);
+	t->ktimer.expires = ktime_set(k->event.id.raw[0], k->event.id.raw[1]);
+	t->ktimer.function = kevent_timer_func;
+	t->ktimer_event = k;
+
+	err = kevent_storage_init(&t->ktimer, &t->ktimer_storage);
+	if (err)
+		goto err_out_free;
+	lockdep_set_class(&t->ktimer_storage.lock, &kevent_timer_key);
+
+	err = kevent_storage_enqueue(&t->ktimer_storage, k);
+	if (err)
+		goto err_out_st_fini;
+
+	hrtimer_start(&t->ktimer, t->ktimer.expires, HRTIMER_REL);
+
+	return 0;
+
+err_out_st_fini:
+	kevent_storage_fini(&t->ktimer_storage);
+err_out_free:
+	kfree(t);
+
+	return err;
+}
+
+static int kevent_timer_dequeue(struct kevent *k)
+{
+	struct kevent_storage *st = k->st;
+	struct kevent_timer *t = container_of(st, struct kevent_timer, ktimer_storage);
+
+	hrtimer_cancel(&t->ktimer);
+	kevent_storage_dequeue(st, k);
+	kfree(t);
+
+	return 0;
+}
+
+static int kevent_timer_callback(struct kevent *k)
+{
+	k->event.ret_data[0] = jiffies_to_msecs(jiffies);
+	return 1;
+}
+
+static int __init kevent_init_timer(void)
+{
+	struct kevent_callbacks tc = {
+		.callback = &kevent_timer_callback,
+		.enqueue = &kevent_timer_enqueue,
+		.dequeue = &kevent_timer_dequeue,
+		.flags = 0,
+	};
+
+	return kevent_add_callbacks(&tc, KEVENT_TIMER);
+}
+module_init(kevent_init_timer);
+


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [take31 6/10] kevent: Pipe notifications.
  2007-01-08 19:25           ` [take31 5/10] kevent: Timer notifications Evgeniy Polyakov
@ 2007-01-08 19:25             ` Evgeniy Polyakov
  2007-01-08 19:25               ` [take31 7/10] kevent: Signal notifications Evgeniy Polyakov
  0 siblings, 1 reply; 76+ messages in thread
From: Evgeniy Polyakov @ 2007-01-08 19:25 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: David Miller, Ulrich Drepper, Andrew Morton, Evgeniy Polyakov,
	netdev, Zach Brown, Christoph Hellwig, Chase Venters,
	Johann Borck, linux-kernel, Jeff Garzik, Jamal Hadi Salim,
	Ingo Molnar


Pipe notifications.


diff --git a/fs/pipe.c b/fs/pipe.c
index 68090e8..0c75bf1 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -16,6 +16,7 @@
 #include <linux/uio.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
+#include <linux/kevent.h>
 
 #include <asm/uaccess.h>
 #include <asm/ioctls.h>
@@ -313,6 +314,7 @@ redo:
 			break;
 		}
 		if (do_wakeup) {
+			kevent_pipe_notify(inode, KEVENT_SOCKET_SEND);
 			wake_up_interruptible_sync(&pipe->wait);
  			kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
 		}
@@ -322,6 +324,7 @@ redo:
 
 	/* Signal writers asynchronously that there is more room. */
 	if (do_wakeup) {
+		kevent_pipe_notify(inode, KEVENT_SOCKET_SEND);
 		wake_up_interruptible(&pipe->wait);
 		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
 	}
@@ -484,6 +487,7 @@ redo2:
 			break;
 		}
 		if (do_wakeup) {
+			kevent_pipe_notify(inode, KEVENT_SOCKET_RECV);
 			wake_up_interruptible_sync(&pipe->wait);
 			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 			do_wakeup = 0;
@@ -495,6 +499,7 @@ redo2:
 out:
 	mutex_unlock(&inode->i_mutex);
 	if (do_wakeup) {
+		kevent_pipe_notify(inode, KEVENT_SOCKET_RECV);
 		wake_up_interruptible(&pipe->wait);
 		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 	}
@@ -590,6 +595,7 @@ pipe_release(struct inode *inode, int decr, int decw)
 		free_pipe_info(inode);
 	} else {
 		wake_up_interruptible(&pipe->wait);
+		kevent_pipe_notify(inode, KEVENT_SOCKET_SEND|KEVENT_SOCKET_RECV);
 		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
 	}
diff --git a/kernel/kevent/kevent_pipe.c b/kernel/kevent/kevent_pipe.c
new file mode 100644
index 0000000..91dc1eb
--- /dev/null
+++ b/kernel/kevent/kevent_pipe.c
@@ -0,0 +1,123 @@
+/*
+ * 	kevent_pipe.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/kevent.h>
+#include <linux/pipe_fs_i.h>
+
+static int kevent_pipe_callback(struct kevent *k)
+{
+	struct inode *inode = k->st->origin;
+	struct pipe_inode_info *pipe = inode->i_pipe;
+	int nrbufs = pipe->nrbufs;
+
+	if (k->event.event & KEVENT_SOCKET_RECV && nrbufs > 0) {
+		if (!pipe->writers)
+			return -1;
+		return 1;
+	}
+	
+	if (k->event.event & KEVENT_SOCKET_SEND && nrbufs < PIPE_BUFFERS) {
+		if (!pipe->readers)
+			return -1;
+		return 1;
+	}
+
+	return 0;
+}
+
+int kevent_pipe_enqueue(struct kevent *k)
+{
+	struct file *pipe;
+	int err = -EBADF;
+	struct inode *inode;
+
+	pipe = fget(k->event.id.raw[0]);
+	if (!pipe)
+		goto err_out_exit;
+
+	inode = igrab(pipe->f_dentry->d_inode);
+	if (!inode)
+		goto err_out_fput;
+
+	err = -EINVAL;
+	if (!S_ISFIFO(inode->i_mode))
+		goto err_out_iput;
+
+	err = kevent_storage_enqueue(&inode->st, k);
+	if (err)
+		goto err_out_iput;
+
+	if (k->event.req_flags & KEVENT_REQ_ALWAYS_QUEUE) {
+		kevent_requeue(k);
+		err = 0;
+	} else {
+		err = k->callbacks.callback(k);
+		if (err)
+			goto err_out_dequeue;
+	}
+
+	fput(pipe);
+
+	return err;
+
+err_out_dequeue:
+	kevent_storage_dequeue(k->st, k);
+err_out_iput:
+	iput(inode);
+err_out_fput:
+	fput(pipe);
+err_out_exit:
+	return err;
+}
+
+int kevent_pipe_dequeue(struct kevent *k)
+{
+	struct inode *inode = k->st->origin;
+
+	kevent_storage_dequeue(k->st, k);
+	iput(inode);
+
+	return 0;
+}
+
+void kevent_pipe_notify(struct inode *inode, u32 event)
+{
+	kevent_storage_ready(&inode->st, NULL, event);
+}
+
+static int __init kevent_init_pipe(void)
+{
+	struct kevent_callbacks sc = {
+		.callback = &kevent_pipe_callback,
+		.enqueue = &kevent_pipe_enqueue,
+		.dequeue = &kevent_pipe_dequeue,
+		.flags = 0,
+	};
+
+	return kevent_add_callbacks(&sc, KEVENT_PIPE);
+}
+module_init(kevent_init_pipe);


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [take31 9/10] kevent: Private userspace notifications.
  2007-01-08 19:26                 ` [take31 8/10] kevent: Kevent posix timer notifications Evgeniy Polyakov
@ 2007-01-08 19:26                   ` Evgeniy Polyakov
  2007-01-08 19:26                     ` [take31 10/10] kevent: Kevent based AIO (aio_sendfile()) Evgeniy Polyakov
  0 siblings, 1 reply; 76+ messages in thread
From: Evgeniy Polyakov @ 2007-01-08 19:26 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: David Miller, Ulrich Drepper, Andrew Morton, Evgeniy Polyakov,
	netdev, Zach Brown, Christoph Hellwig, Chase Venters,
	Johann Borck, linux-kernel, Jeff Garzik, Jamal Hadi Salim,
	Ingo Molnar


Private userspace notifications.

Allows to register notifications of any private userspace
events over kevent. Events can be marked as readt using 
kevent_ctl(KEVENT_READY) command.

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>

diff --git a/kernel/kevent/kevent_unotify.c b/kernel/kevent/kevent_unotify.c
new file mode 100644
index 0000000..618c09c
--- /dev/null
+++ b/kernel/kevent/kevent_unotify.c
@@ -0,0 +1,62 @@
+/*
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/kevent.h>
+
+static int kevent_unotify_callback(struct kevent *k)
+{
+	return 1;
+}
+
+int kevent_unotify_enqueue(struct kevent *k)
+{
+	int err;
+
+	err = kevent_storage_enqueue(&k->user->st, k);
+	if (err)
+		goto err_out_exit;
+
+	if (k->event.req_flags & KEVENT_REQ_ALWAYS_QUEUE)
+		kevent_requeue(k);
+
+	return 0;
+
+err_out_exit:
+	return err;
+}
+
+int kevent_unotify_dequeue(struct kevent *k)
+{
+	kevent_storage_dequeue(k->st, k);
+	return 0;
+}
+
+static int __init kevent_init_unotify(void)
+{
+	struct kevent_callbacks sc = {
+		.callback = &kevent_unotify_callback,
+		.enqueue = &kevent_unotify_enqueue,
+		.dequeue = &kevent_unotify_dequeue,
+		.flags = 0,
+	};
+
+	return kevent_add_callbacks(&sc, KEVENT_UNOTIFY);
+}
+module_init(kevent_init_unotify);


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [take31 10/10] kevent: Kevent based AIO (aio_sendfile()).
  2007-01-08 19:26                   ` [take31 9/10] kevent: Private userspace notifications Evgeniy Polyakov
@ 2007-01-08 19:26                     ` Evgeniy Polyakov
  0 siblings, 0 replies; 76+ messages in thread
From: Evgeniy Polyakov @ 2007-01-08 19:26 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: David Miller, Ulrich Drepper, Andrew Morton, Evgeniy Polyakov,
	netdev, Zach Brown, Christoph Hellwig, Chase Venters,
	Johann Borck, linux-kernel, Jeff Garzik, Jamal Hadi Salim,
	Ingo Molnar, linux-fsdevel


Kevent based AIO (aio_sendfile()).

aio_sendfile() contains of two major parts: AIO state machine and page 
processing code. The former is just a small subsystem, which allows to 
queue callback for theirs invocation in process' context on behalf of 
pool of kernel threads. It allows to queue caches of callbacks to the 
local thread or to any other specified. Each cache of callbacks is 
processed until there are callbacks in it, callbacks can requeue 
themselfs into the same cache.

Real work is being done in page processing code - code which populates 
pages into VFS cache and then sends pages to the destination socket 
via ->sendpage(). Unlike previous aio_sendfile() implementation, new 
one does not require low-level filesystem specific callbacks (->get_block())
at all, instead I extended struct address_space_operations to contain new 
member called ->aio_readpages(), which is exactly the same as ->readpage() 
(read: mpage_readpages()) except different BIO allocation and sumbission 
routines. I changed mpage_readpages() to provide mpage_alloc() and 
mpage_bio_submit() to the new function called __mpage_readpages(), which is 
exactly old mpage_readpages() with provided callback invocation instead of 
usage for old functions. mpage_readpages_aio() provides kevent specific 
callbacks, which calls old functions, but with different destructor callbacks,
which are essentially the same, except that they reschedule AIO processing.

Benchmarks of simple sendfile vs. aio_sendfile did not show noticeble
win of any approach, but I want to notice, that my receiving side is 
not that best in my case (I managed to create a test where usual
senfile() stuck until signal received, the same hapend with aio_sendfile()
too, but the latter can be buggy).
It would be good to use it in lighttpd (I will create a patch after
some feedback received and approach will be considered as good.

AIO state machine is a base for network AIO (which becomes
quite trivial), but I will not start implementation until
roadback of kevent as a whole and AIO implementation become more clear.

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>

diff --git a/fs/bio.c b/fs/bio.c
index 7618bcb..291e7e8 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -120,7 +120,7 @@ void bio_free(struct bio *bio, struct bio_set *bio_set)
 /*
  * default destructor for a bio allocated with bio_alloc_bioset()
  */
-static void bio_fs_destructor(struct bio *bio)
+void bio_fs_destructor(struct bio *bio)
 {
 	bio_free(bio, fs_bio_set);
 }
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index beaf25f..f08c957 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1650,6 +1650,13 @@ ext3_readpages(struct file *file, struct address_space *mapping,
 	return mpage_readpages(mapping, pages, nr_pages, ext3_get_block);
 }
 
+static int
+ext3_readpages_aio(struct file *file, struct address_space *mapping,
+		struct list_head *pages, unsigned nr_pages, void *priv)
+{
+	return mpage_readpages_aio(mapping, pages, nr_pages, ext3_get_block, priv);
+}
+
 static void ext3_invalidatepage(struct page *page, unsigned long offset)
 {
 	journal_t *journal = EXT3_JOURNAL(page->mapping->host);
@@ -1768,6 +1775,7 @@ static int ext3_journalled_set_page_dirty(struct page *page)
 }
 
 static const struct address_space_operations ext3_ordered_aops = {
+	.aio_readpages	= ext3_readpages_aio,
 	.readpage	= ext3_readpage,
 	.readpages	= ext3_readpages,
 	.writepage	= ext3_ordered_writepage,
diff --git a/fs/mpage.c b/fs/mpage.c
index 692a3e5..e5ba44b 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -102,7 +102,7 @@ static struct bio *mpage_bio_submit(int rw, struct bio *bio)
 static struct bio *
 mpage_alloc(struct block_device *bdev,
 		sector_t first_sector, int nr_vecs,
-		gfp_t gfp_flags)
+		gfp_t gfp_flags, void *priv)
 {
 	struct bio *bio;
 
@@ -116,6 +116,7 @@ mpage_alloc(struct block_device *bdev,
 	if (bio) {
 		bio->bi_bdev = bdev;
 		bio->bi_sector = first_sector;
+		bio->bi_private = priv;
 	}
 	return bio;
 }
@@ -175,7 +176,10 @@ map_buffer_to_page(struct page *page, struct buffer_head *bh, int page_block)
 static struct bio *
 do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
 		sector_t *last_block_in_bio, struct buffer_head *map_bh,
-		unsigned long *first_logical_block, get_block_t get_block)
+		unsigned long *first_logical_block, get_block_t get_block,
+		struct bio *(*alloc)(struct block_device *bdev, sector_t first_sector, 
+			int nr_vecs, gfp_t gfp_flags, void *priv),
+		struct bio *(*submit)(int rw, struct bio *bio), void *priv)
 {
 	struct inode *inode = page->mapping->host;
 	const unsigned blkbits = inode->i_blkbits;
@@ -302,25 +306,25 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
 	 * This page will go to BIO.  Do we need to send this BIO off first?
 	 */
 	if (bio && (*last_block_in_bio != blocks[0] - 1))
-		bio = mpage_bio_submit(READ, bio);
+		bio = submit(READ, bio);
 
 alloc_new:
 	if (bio == NULL) {
-		bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
+		bio = alloc(bdev, blocks[0] << (blkbits - 9),
 			  	min_t(int, nr_pages, bio_get_nr_vecs(bdev)),
-				GFP_KERNEL);
+				GFP_KERNEL, priv);
 		if (bio == NULL)
 			goto confused;
 	}
 
 	length = first_hole << blkbits;
 	if (bio_add_page(bio, page, length, 0) < length) {
-		bio = mpage_bio_submit(READ, bio);
+		bio = submit(READ, bio);
 		goto alloc_new;
 	}
 
 	if (buffer_boundary(map_bh) || (first_hole != blocks_per_page))
-		bio = mpage_bio_submit(READ, bio);
+		bio = submit(READ, bio);
 	else
 		*last_block_in_bio = blocks[blocks_per_page - 1];
 out:
@@ -328,7 +332,7 @@ out:
 
 confused:
 	if (bio)
-		bio = mpage_bio_submit(READ, bio);
+		bio = submit(READ, bio);
 	if (!PageUptodate(page))
 	        block_read_full_page(page, get_block);
 	else
@@ -336,6 +340,48 @@ confused:
 	goto out;
 }
 
+int
+__mpage_readpages(struct address_space *mapping, struct list_head *pages,
+				unsigned nr_pages, get_block_t get_block,
+		struct bio *(*alloc)(struct block_device *bdev, sector_t first_sector, 
+			int nr_vecs, gfp_t gfp_flags, void *priv),
+		struct bio *(*submit)(int rw, struct bio *bio), 
+		void *priv)
+{
+	struct bio *bio = NULL;
+	unsigned page_idx;
+	sector_t last_block_in_bio = 0;
+	struct pagevec lru_pvec;
+	struct buffer_head map_bh;
+	unsigned long first_logical_block = 0;
+
+	clear_buffer_mapped(&map_bh);
+	pagevec_init(&lru_pvec, 0);
+	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
+		struct page *page = list_entry(pages->prev, struct page, lru);
+
+		prefetchw(&page->flags);
+		list_del(&page->lru);
+		if (!add_to_page_cache(page, mapping,
+					page->index, GFP_KERNEL)) {
+			bio = do_mpage_readpage(bio, page,
+					nr_pages - page_idx,
+					&last_block_in_bio, &map_bh,
+					&first_logical_block,
+					get_block, alloc, submit, priv);
+			if (!pagevec_add(&lru_pvec, page))
+				__pagevec_lru_add(&lru_pvec);
+		} else {
+			page_cache_release(page);
+		}
+	}
+	pagevec_lru_add(&lru_pvec);
+	BUG_ON(!list_empty(pages));
+	if (bio)
+		submit(READ, bio);
+	return 0;
+}
+
 /**
  * mpage_readpages - populate an address space with some pages, and
  *                       start reads against them.
@@ -386,40 +432,28 @@ int
 mpage_readpages(struct address_space *mapping, struct list_head *pages,
 				unsigned nr_pages, get_block_t get_block)
 {
-	struct bio *bio = NULL;
-	unsigned page_idx;
-	sector_t last_block_in_bio = 0;
-	struct pagevec lru_pvec;
-	struct buffer_head map_bh;
-	unsigned long first_logical_block = 0;
+	return __mpage_readpages(mapping, pages, nr_pages, get_block,
+			mpage_alloc, mpage_bio_submit, NULL);
+}
+EXPORT_SYMBOL(mpage_readpages);
 
-	clear_buffer_mapped(&map_bh);
-	pagevec_init(&lru_pvec, 0);
-	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
-		struct page *page = list_entry(pages->prev, struct page, lru);
+#ifdef CONFIG_KEVENT_AIO
+extern struct bio *kaio_mpage_alloc(struct block_device *bdev, sector_t first_sector, 
+		int nr_vecs, gfp_t gfp_flags, void *priv);
+extern struct bio *kaio_mpage_bio_submit(int rw, struct bio *bio);
+#else
+#define kaio_mpage_alloc	mpage_alloc
+#define kaio_mpage_bio_submit	mpage_bio_submit
+#endif
 
-		prefetchw(&page->flags);
-		list_del(&page->lru);
-		if (!add_to_page_cache(page, mapping,
-					page->index, GFP_KERNEL)) {
-			bio = do_mpage_readpage(bio, page,
-					nr_pages - page_idx,
-					&last_block_in_bio, &map_bh,
-					&first_logical_block,
-					get_block);
-			if (!pagevec_add(&lru_pvec, page))
-				__pagevec_lru_add(&lru_pvec);
-		} else {
-			page_cache_release(page);
-		}
-	}
-	pagevec_lru_add(&lru_pvec);
-	BUG_ON(!list_empty(pages));
-	if (bio)
-		mpage_bio_submit(READ, bio);
-	return 0;
+int
+mpage_readpages_aio(struct address_space *mapping, struct list_head *pages,
+				unsigned nr_pages, get_block_t get_block, void *priv)
+{
+	return __mpage_readpages(mapping, pages, nr_pages, get_block,
+			kaio_mpage_alloc, kaio_mpage_bio_submit, priv);
 }
-EXPORT_SYMBOL(mpage_readpages);
+EXPORT_SYMBOL(mpage_readpages_aio);
 
 /*
  * This isn't called much at all
@@ -433,7 +467,8 @@ int mpage_readpage(struct page *page, get_block_t get_block)
 
 	clear_buffer_mapped(&map_bh);
 	bio = do_mpage_readpage(bio, page, 1, &last_block_in_bio,
-			&map_bh, &first_logical_block, get_block);
+			&map_bh, &first_logical_block, get_block,
+			mpage_alloc, mpage_bio_submit, NULL);
 	if (bio)
 		mpage_bio_submit(READ, bio);
 	return 0;
@@ -595,7 +630,7 @@ page_is_mapped:
 alloc_new:
 	if (bio == NULL) {
 		bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
-				bio_get_nr_vecs(bdev), GFP_NOFS|__GFP_HIGH);
+				bio_get_nr_vecs(bdev), GFP_NOFS|__GFP_HIGH, NULL);
 		if (bio == NULL)
 			goto confused;
 	}
diff --git a/include/linux/mpage.h b/include/linux/mpage.h
index cc5fb75..accdbdd 100644
--- a/include/linux/mpage.h
+++ b/include/linux/mpage.h
@@ -16,6 +16,8 @@ typedef int (writepage_t)(struct page *page, struct writeback_control *wbc);
 
 int mpage_readpages(struct address_space *mapping, struct list_head *pages,
 				unsigned nr_pages, get_block_t get_block);
+int mpage_readpages_aio(struct address_space *mapping, struct list_head *pages,
+				unsigned nr_pages, get_block_t get_block, void *priv);
 int mpage_readpage(struct page *page, get_block_t get_block);
 int mpage_writepages(struct address_space *mapping,
 		struct writeback_control *wbc, get_block_t get_block);
diff --git a/kernel/kevent/kevent_aio.c b/kernel/kevent/kevent_aio.c
new file mode 100644
index 0000000..bf51963
--- /dev/null
+++ b/kernel/kevent/kevent_aio.c
@@ -0,0 +1,752 @@
+/*
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/spinlock.h>
+#include <linux/list.h>
+#include <linux/kthread.h>
+#include <linux/slab.h>
+#include <linux/bio.h>
+#include <linux/pagemap.h>
+#include <linux/file.h>
+#include <linux/swap.h>
+#include <linux/kevent.h>
+
+#define KAIO_CALL_NUM		8
+#define KAIO_THREAD_NUM		8
+
+//#define KEVENT_AIO_DEBUG
+
+#ifdef KEVENT_AIO_DEBUG
+#define dprintk(f, a...) printk(f, ##a)
+#else
+#define dprintk(f, a...) do {} while (0)
+#endif
+
+struct kaio_thread
+{
+	struct list_head	req_list;
+	spinlock_t		req_lock;
+	struct task_struct	*thread;
+	int			refcnt;
+	wait_queue_head_t 	wait;
+};
+
+/*
+ * Array of working threads.
+ * It can only be accessed under RCU protection, 
+ * so threads reference counters are not atomic.
+ */
+static struct kaio_thread *kaio_threads[KAIO_THREAD_NUM] __read_mostly;
+static struct kmem_cache *kaio_req_cache __read_mostly;
+static struct kmem_cache *kaio_priv_cache __read_mostly;
+
+struct kaio_req;
+typedef int (* kaio_callback)(struct kaio_req *req, int direct);
+
+#define KAIO_REQ_PENDING	0
+
+/*
+ * Cache of kaio request callbacks.
+ * It is not allowed to change the same cache entry
+ * simultaneously (for example it is forbidden to add entries
+ * in parallel).
+ *
+ * When cache entry is scheduled for execution in one of the threads,
+ * it is forbidden to access it, since it will be freed when
+ * all callbacks have been invoked.
+ *
+ * It is possible to add callbacks into this cache from callbacks itself.
+ */
+struct kaio_req
+{
+	struct list_head	req_entry;
+	kaio_callback		call[KAIO_CALL_NUM];
+	int			read_idx, add_idx;
+	int			cpu;
+	long			flags;
+	void			(*destructor)(struct kaio_req *);
+	void			*priv;
+};
+
+/*
+ * Returns pointer to thread entry for given index.
+ * Must be called under RCU protection.
+ */
+static inline struct kaio_thread *kaio_get_thread(int cpu)
+{
+	struct kaio_thread *th;
+
+	if (cpu == -1)
+		cpu = smp_processor_id();
+
+	if (unlikely(cpu >= KAIO_THREAD_NUM || !kaio_threads[cpu]))
+		return NULL;
+
+	th = kaio_threads[cpu];
+	th->refcnt++;
+
+	return th;
+}
+
+/*
+ * Drops reference counter for given thread.
+ * Must be called under RCU protection.
+ */
+static inline void kaio_put_thread(struct kaio_thread *th)
+{
+	th->refcnt--;
+}
+
+void kaio_schedule_req(struct kaio_req *req)
+{
+	if (!test_and_set_bit(KAIO_REQ_PENDING, &req->flags)) {
+		struct kaio_thread *th;
+		unsigned long flags;
+
+		rcu_read_lock();
+		th = kaio_get_thread(req->cpu);
+		if (!th) {
+			req->cpu = -1;
+			th = kaio_get_thread(-1);
+			BUG_ON(!th);
+		}
+
+		spin_lock_irqsave(&th->req_lock, flags);
+		list_add_tail(&req->req_entry, &th->req_list);
+		spin_unlock_irqrestore(&th->req_lock, flags);
+
+		wake_up(&th->wait);
+
+		kaio_put_thread(th);
+		rcu_read_unlock();
+	}
+}
+
+EXPORT_SYMBOL_GPL(kaio_schedule_req);
+
+/*
+ * Append a call request into cache.
+ * Returns -EOVERFLOW in case cache is full, and 0 otherwise.
+ */
+int kaio_append_call(struct kaio_req *req, kaio_callback call, int cpu)
+{
+	if ((req->add_idx + 1 == req->read_idx) ||
+			((req->add_idx + 1 == KAIO_CALL_NUM) && req->read_idx == 0))
+		return -EOVERFLOW;
+
+	req->cpu = cpu;
+	req->call[req->add_idx] = call;
+
+	dprintk("%s: req: %p, read_idx: %d, add_idx: %d, call: %p [%p].\n",
+			__func__, req, req->read_idx, req->add_idx, 
+			req->call[req->read_idx], req->call[req->add_idx]);
+	if (++req->add_idx == KAIO_CALL_NUM)
+		req->add_idx = 0;
+	
+	return 0;
+}
+
+EXPORT_SYMBOL_GPL(kaio_append_call);
+
+/*
+ * Adds one call request into given cache.
+ * If cache is NULL or full, allocate new one.
+ */
+struct kaio_req *kaio_add_call(struct kaio_req *req, kaio_callback call, int cpu, gfp_t gflags)
+{
+	if (req && !kaio_append_call(req, call, cpu)) {
+		kaio_schedule_req(req);
+		return req;
+	}
+
+	req = kmem_cache_alloc(kaio_req_cache, gflags);
+	if (!req)
+		return NULL;
+
+	memset(req->call, 0, sizeof(req->call));
+
+	req->destructor = NULL;
+	req->cpu = cpu;
+	req->call[0] = call;
+	req->add_idx = 1;
+	req->read_idx = 0;
+	req->flags = 0;
+
+	dprintk("%s: req: %p, call: %p [%p].\n", __func__, req, call, req->call[0]);
+
+	return req;
+}
+
+EXPORT_SYMBOL_GPL(kaio_add_call);
+
+/*
+ * Call appropriate callbacks in cache.
+ * This can only be called by working threads, which means that cache
+ * is filled (probably partially) and are not even accessible from
+ * the originator of requests, which means that cache will be freed
+ * when all callbacks are invoked.
+ *
+ * Callback itself can reschedule new callback into the same cache.
+ *
+ * If callback returns negative value, the whole cache will be freed.
+ * If positive value is returned, then further processing is stopped, 
+ * so cache can be queued into the end of the processing FIFO by callback.
+ * If zero is returned, next callback will be invoked if any.
+ */
+static int kaio_call(struct kaio_req *req)
+{
+	int err = -EINVAL;
+
+	if (likely(req->add_idx != req->read_idx)) {
+		dprintk("%s: req: %p, read_idx: %d, add_idx: %d, call: %p [%p].\n",
+				__func__, req, req->read_idx, req->add_idx, 
+				req->call[req->read_idx], req->call[0]);
+		err = (*req->call[req->read_idx])(req, 0);
+		if (++req->read_idx == KAIO_CALL_NUM)
+			req->read_idx = 0;
+	}
+	return err;
+}
+
+static int kaio_thread_process(void *data)
+{
+	struct kaio_thread *th = data;
+	unsigned long flags;
+	struct kaio_req *req;
+	int err;
+
+	while (!kthread_should_stop()) {
+		wait_event_interruptible_timeout(th->wait, !list_empty(&th->req_list), HZ);
+
+		req = NULL;
+		spin_lock_irqsave(&th->req_lock, flags);
+		if (!list_empty(&th->req_list)) {
+			req = list_entry(th->req_list.prev, struct kaio_req, req_entry);
+			list_del(&req->req_entry);
+		}
+		spin_unlock_irqrestore(&th->req_lock, flags);
+
+		if (req) {
+			err = 0;
+			while ((req->read_idx != req->add_idx) && !kthread_should_stop()) {
+				err = kaio_call(req);
+				if (err != 0)
+					break;
+				dprintk("%s: req: %p, read_idx: %d, add_idx: %d, err: %d.\n",
+						__func__, req, req->read_idx, req->add_idx, err);
+			}
+			if (err < 0 || req->read_idx == req->add_idx) {
+				dprintk("%s: freeing req: %p, priv: %p.\n", __func__, req, req->priv);
+				if (req->destructor)
+					req->destructor(req);
+				kmem_cache_free(kaio_req_cache, req);
+			} else if (err > 0) {
+				clear_bit(KAIO_REQ_PENDING, &req->flags);
+			}
+		}
+	}
+
+	return 0;
+}
+
+struct kaio_private
+{
+	union {
+		void 		*sptr;
+		__u64		sdata;
+	};
+	union {
+		void 		*dptr;
+		__u64		ddata;
+	};
+	__u64			offset, processed;
+	__u64			count;
+	struct kevent_user	*kevent_user;
+};
+
+extern void bio_fs_destructor(struct bio *bio);
+
+static void kaio_bio_destructor(struct bio *bio)
+{
+	dprintk("%s: bio=%p, num=%u.\n", __func__, bio, bio->bi_vcnt);
+	bio_fs_destructor(bio);
+}
+
+static int kaio_mpage_end_io_read(struct bio *bio, unsigned int bytes_done, int err)
+{
+	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+	struct kaio_req *req = bio->bi_private;
+
+	if (bio->bi_size)
+		return 1;
+
+	do {
+		struct page *page = bvec->bv_page;
+
+		if (--bvec >= bio->bi_io_vec)
+			prefetchw(&bvec->bv_page->flags);
+
+		dprintk("%s: page: %p, uptodate: %d.\n", __func__, page, uptodate);
+
+		if (uptodate) {
+			SetPageUptodate(page);
+		} else {
+			ClearPageUptodate(page);
+			SetPageError(page);
+		}
+		unlock_page(page);
+	} while (bvec >= bio->bi_io_vec);
+
+	dprintk("%s: bio: %p, req: %p, pending: %d.\n", 
+			__func__, bio, req, test_bit(KAIO_REQ_PENDING, &req->flags));
+	kaio_schedule_req(req);
+
+	bio_put(bio);
+	return 0;
+}
+
+struct bio *kaio_mpage_bio_submit(int rw, struct bio *bio)
+{
+	if (bio) {
+		bio->bi_end_io = kaio_mpage_end_io_read;
+		dprintk("%s: bio=%p, num=%u.\n", __func__, bio, bio->bi_vcnt);
+		submit_bio(READ, bio);
+	}
+	return NULL;
+}
+
+struct bio *kaio_mpage_alloc(struct block_device *bdev,
+		sector_t first_sector, int nr_vecs, gfp_t gfp_flags, void *priv)
+{
+	struct bio *bio;
+
+	bio = bio_alloc(gfp_flags, nr_vecs);
+
+	if (bio == NULL && (current->flags & PF_MEMALLOC)) {
+		while (!bio && (nr_vecs /= 2))
+			bio = bio_alloc(gfp_flags, nr_vecs);
+	}
+
+	if (bio) {
+		bio->bi_bdev = bdev;
+		bio->bi_sector = first_sector;
+		bio->bi_private = priv;
+		bio->bi_destructor = kaio_bio_destructor;
+		dprintk("%s: bio: %p, req: %p, num: %d.\n", __func__, bio, priv, nr_vecs);
+	}
+	return bio;
+}
+
+static size_t kaio_vfs_read_actor(struct kaio_private *priv, struct page *page, size_t len)
+{
+	size_t ret;
+	struct socket *sock = priv->dptr;
+	struct file *file = sock->file;
+	
+	ret = file->f_op->sendpage(file, page, 0, len, &file->f_pos, 1);
+
+	dprintk("%s: page=%p, len=%zu, ret=%zd.\n", 
+			__func__, page, len, ret);
+
+	return ret;
+}
+
+static int kaio_vfs_read(struct kaio_private *priv,
+		size_t (*actor)(struct kaio_private *, struct page *, size_t))
+{
+	struct address_space *mapping;
+	struct file *file = priv->sptr;
+	size_t isize, actor_size;
+	int i, pg_num;
+
+	mapping = file->f_mapping;
+	isize = i_size_read(file->f_dentry->d_inode);
+
+	if (priv->processed >= isize) {
+		priv->count = 0;
+		return 0;
+	}
+	priv->count = isize - priv->processed;
+	pg_num = ALIGN(min_t(u64, isize, priv->count), PAGE_SIZE) >> PAGE_SHIFT;
+
+	for (i=0; i<pg_num && priv->count; ++i) {
+		struct page *page;
+		size_t nr = PAGE_CACHE_SIZE;
+
+		page = find_get_page(mapping, priv->processed >> PAGE_CACHE_SHIFT);
+		if (unlikely(page == NULL))
+			break;
+		if (!PageUptodate(page)) {
+			dprintk("%s: %2d: page=%p, processed=%Lu, count=%Lu not uptodate.\n", 
+					__func__, i, page, priv->processed, priv->count);
+			page_cache_release(page);
+			break;
+		}
+
+		if (mapping_writably_mapped(mapping))
+			flush_dcache_page(page);
+
+		mark_page_accessed(page);
+
+		if (nr + priv->processed > isize)
+			nr = isize - priv->processed;
+		if (nr > priv->count)
+			nr = priv->count;
+
+		actor_size = actor(priv, page, nr);
+		if (actor_size < 0) {
+			page_cache_release(page);
+			break;
+		}
+
+		page_cache_release(page);
+
+		priv->processed += actor_size;
+		priv->count -= actor_size;
+	}
+
+	if (!priv->count)
+		i = pg_num;
+
+	dprintk("%s: end: ret=%d, num=%d, left=%Lu, offset=%Lu, processed=%Lu.\n", 
+			__func__, i, pg_num, priv->count, priv->offset, priv->processed);
+
+	return i;
+}
+
+static int kaio_alloc_cached_page(struct file *file, __u64 offset, struct page **cached_page)
+{
+	struct address_space *mapping = file->f_mapping;
+	struct page *page;
+	int err = 0;
+	pgoff_t index = offset >> PAGE_CACHE_SHIFT;
+
+	page = page_cache_alloc_cold(mapping);
+	if (!page) {
+		err = -ENOMEM;
+		goto out;
+	}
+	page->index = index;
+#if 0
+	err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);
+	if (err) {
+		page_cache_release(page);
+		goto out;
+	}
+#endif
+	*cached_page = page;
+
+out:
+	return err;
+}
+
+static int kaio_read_send_pages(struct kaio_req *req, int direct)
+{
+	struct kaio_private *priv = req->priv;
+	struct file *file = priv->sptr;
+	struct address_space *mapping = file->f_mapping;
+	struct page *page;
+	int err, i, num;
+	//unsigned long limit = (2*1024*1024) >> PAGE_CACHE_SHIFT;
+	unsigned long limit = 128;
+	LIST_HEAD(page_pool);
+
+	kaio_vfs_read(priv, &kaio_vfs_read_actor);
+
+	if (!priv->count) {
+		kevent_storage_ready(&priv->kevent_user->st, NULL, KEVENT_MASK_ALL);
+		kevent_user_put(priv->kevent_user);
+		return 0;
+	}
+
+	kaio_append_call(req, kaio_read_send_pages, -1);
+
+	num = min_t(int, max_sane_readahead(limit), 
+			ALIGN(priv->count, PAGE_SIZE) >> PAGE_SHIFT);
+
+	for (i=0; i<num; ++i) {
+		page = NULL;
+		err = kaio_alloc_cached_page(file, priv->offset, &page);
+		if (err || !page)
+			break;
+		dprintk("%s: i: %d, j: %d, num: %d, page: %p, err: %d, pool: %p, prev: %p, next: %p.\n", 
+				__func__, i, j, num, page, err, &page_pool, 
+				list_entry((&page_pool)->prev, struct page, lru), 
+				list_entry((&page_pool)->next, struct page, lru));
+
+		list_add(&page->lru, &page_pool);
+
+		priv->offset += PAGE_CACHE_SIZE;
+	}
+
+	dprintk("%s: submit: pool: %p, page: %p, num: %d.\n",
+			__func__, &page_pool, list_entry((&page_pool)->prev, struct page, lru),
+			i);
+
+	err = mapping->a_ops->aio_readpages(file, mapping, &page_pool, i, req);
+	if (err) {
+		dprintk("%s: kevent_mpage_readpages failed: err=%d, count=%Lu.\n",
+				__func__, err, priv->count);
+		return err;
+	}
+
+	return 1;
+}
+
+extern struct file_operations kevent_user_fops;
+
+static int kaio_add_kevent(int fd, struct kaio_req *req)
+{
+	struct ukevent uk;
+	struct file *file;
+	struct kevent_user *u;
+	int err, need_fput = 0;
+
+	file = fget_light(fd, &need_fput);
+	if (!file) {
+		err = -EBADF;
+		goto err_out;
+	}
+
+	if (file->f_op != &kevent_user_fops) {
+		err = -EINVAL;
+		goto err_out_fput;
+	}
+
+	u = file->private_data;
+
+	memset(&uk, 0, sizeof(struct ukevent));
+
+	uk.event = KEVENT_MASK_ALL;
+	uk.type = KEVENT_AIO;
+	uk.id.raw_u64 = (unsigned long)(req);
+	uk.req_flags = KEVENT_REQ_ONESHOT | KEVENT_REQ_ALWAYS_QUEUE;
+	uk.ptr = req;
+
+	err = kevent_user_add_ukevent(&uk, u);
+	if (err)
+		goto err_out_fput;
+
+	kevent_user_get(u);
+
+	fput_light(file, need_fput);
+
+	return 0;
+
+err_out_fput:
+	fput_light(file, need_fput);
+err_out:
+	return err;
+}
+
+static void kaio_destructor(struct kaio_req *req)
+{
+	kmem_cache_free(kaio_priv_cache, req->priv);
+}
+
+asmlinkage long sys_aio_sendfile(int kevent_fd, int sock_fd, int in_fd, off_t offset, size_t count)
+{
+	struct kaio_req *req;
+	struct file *file;
+	struct socket *sock;
+	struct kaio_private *priv;
+	int err;
+
+	file = fget(in_fd);
+	if (!file) {
+		err = -EBADF;
+		goto err_out_exit;
+	}
+
+	sock = sockfd_lookup(sock_fd, &err);
+	if (!sock)
+		goto err_out_fput;
+
+	priv = kmem_cache_alloc(kaio_priv_cache, GFP_KERNEL);
+	if (!priv) {
+		err = -ENOMEM;
+		goto err_out_sput;
+	}
+
+	priv->sptr = file;
+	priv->dptr = sock;
+	priv->offset = offset;
+	priv->count = min_t(u64, i_size_read(file->f_dentry->d_inode), count);
+	priv->processed = offset;
+
+	req = kaio_add_call(NULL, &kaio_read_send_pages, -1, GFP_KERNEL);
+	if (!req) {
+		err = -ENOMEM;
+		goto err_out_free;
+	}
+
+	dprintk("%s: req: %p, priv: %p, call: %p [%p], offset: %Lu, processed: %Lu, count: %Lu.\n",
+			__func__, req, priv, &kaio_read_send_pages, 
+			kaio_read_send_pages, priv->offset, priv->processed, priv->count);
+
+	req->destructor = kaio_destructor;
+	req->priv = priv;
+
+	err = kaio_add_kevent(kevent_fd, req);
+	if (err)
+		goto err_out_remove;
+
+	kaio_schedule_req(req);
+
+	return (long)req;
+
+err_out_remove:
+	/* It is safe to just free the object since it is guaranteed that it was not
+	 * queued for processing.
+	 */
+	kmem_cache_free(kaio_req_cache, req);
+err_out_free:
+	kmem_cache_free(kaio_priv_cache, priv);
+err_out_sput:
+	sockfd_put(sock);
+err_out_fput:
+	fput(file);
+err_out_exit:
+	return err;
+}
+
+static int kevent_aio_callback(struct kevent *k)
+{
+	struct kaio_req *req = k->event.ptr;
+	struct kaio_private *priv = req->priv;
+	
+	dprintk("%s: req: %p, priv: %p, processed: %Lu, offset: %Lu, count: %Lu.\n", 
+			__func__, req, priv, priv->processed, priv->offset, priv->count);
+
+	if (!priv->count) {
+		__u32 *processed = (__u32 *)&priv->processed;
+		k->event.ret_data[0] = processed[0];
+		k->event.ret_data[1] = processed[1];
+		return 1;
+	}
+
+	return 0;
+}
+
+int kevent_aio_enqueue(struct kevent *k)
+{
+	int err;
+	struct kaio_req *req = k->event.ptr;
+	struct kaio_private *priv = req->priv;
+
+	err = kevent_storage_enqueue(&k->user->st, k);
+	if (err)
+		goto err_out_exit;
+
+	priv->kevent_user = k->user;
+	dprintk("%s: req: %p, priv: %p, st: %p.\n", __func__, req, priv, priv->st);
+
+	if (k->event.req_flags & KEVENT_REQ_ALWAYS_QUEUE)
+		kevent_requeue(k);
+
+	return 0;
+
+err_out_exit:
+	return err;
+}
+
+int kevent_aio_dequeue(struct kevent *k)
+{
+	kevent_storage_dequeue(k->st, k);
+
+	return 0;
+}
+
+static void kaio_thread_stop(struct kaio_thread *th)
+{
+	kthread_stop(th->thread);
+	kfree(th);
+}
+
+static int kaio_thread_start(struct kaio_thread **thp, int num)
+{
+	struct kaio_thread *th;
+
+	th = kzalloc(sizeof(struct kaio_thread), GFP_KERNEL);
+	if (!th)
+		return -ENOMEM;
+
+	th->refcnt = 1;
+	spin_lock_init(&th->req_lock);
+	INIT_LIST_HEAD(&th->req_list);
+	init_waitqueue_head(&th->wait);
+
+	th->thread = kthread_run(kaio_thread_process, th, "kaio/%d", num);
+	if (IS_ERR(th->thread)) {
+		int err = PTR_ERR(th->thread);
+		kfree(th);
+		return err;
+	}
+
+	*thp = th;
+	wmb();
+
+	return 0;
+}
+
+static int __init kevent_init_aio(void)
+{
+	struct kevent_callbacks sc = {
+		.callback = &kevent_aio_callback,
+		.enqueue = &kevent_aio_enqueue,
+		.dequeue = &kevent_aio_dequeue,
+		.flags = 0,
+	};
+	int err, i;
+
+	kaio_req_cache = kmem_cache_create("kaio_req", sizeof(struct kaio_req), 
+			0, SLAB_PANIC, NULL, NULL);
+	kaio_priv_cache = kmem_cache_create("kaio_priv", sizeof(struct kaio_private), 
+			0, SLAB_PANIC, NULL, NULL);
+
+	memset(kaio_threads, 0, sizeof(kaio_threads));
+
+	for (i=0; i<KAIO_THREAD_NUM; ++i) {
+		err = kaio_thread_start(&kaio_threads[i], i);
+		if (err)
+			goto err_out_stop;
+	}
+
+	err = kevent_add_callbacks(&sc, KEVENT_AIO);
+	if (err)
+		goto err_out_stop;
+
+	return 0;
+
+err_out_stop:
+	while (--i >= 0) {
+		struct kaio_thread *th = kaio_threads[i];
+
+		kaio_threads[i] = NULL;
+		wmb();
+
+		kaio_thread_stop(th);
+	}
+	return err;
+}
+module_init(kevent_init_aio);


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [take31 8/10] kevent: Kevent posix timer notifications.
  2007-01-08 19:25               ` [take31 7/10] kevent: Signal notifications Evgeniy Polyakov
@ 2007-01-08 19:26                 ` Evgeniy Polyakov
  2007-01-08 19:26                   ` [take31 9/10] kevent: Private userspace notifications Evgeniy Polyakov
  0 siblings, 1 reply; 76+ messages in thread
From: Evgeniy Polyakov @ 2007-01-08 19:26 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: David Miller, Ulrich Drepper, Andrew Morton, Evgeniy Polyakov,
	netdev, Zach Brown, Christoph Hellwig, Chase Venters,
	Johann Borck, linux-kernel, Jeff Garzik, Jamal Hadi Salim,
	Ingo Molnar


Kevent posix timer notifications.

Simple extensions to POSIX timers which allows
to deliver notification of the timer expiration
through kevent queue.

Example application posix_timer.c can be found
in archive on project homepage.

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>


diff --git a/include/asm-generic/siginfo.h b/include/asm-generic/siginfo.h
index 8786e01..3768746 100644
--- a/include/asm-generic/siginfo.h
+++ b/include/asm-generic/siginfo.h
@@ -235,6 +235,7 @@ typedef struct siginfo {
 #define SIGEV_NONE	1	/* other notification: meaningless */
 #define SIGEV_THREAD	2	/* deliver via thread creation */
 #define SIGEV_THREAD_ID 4	/* deliver to thread */
+#define SIGEV_KEVENT	8	/* deliver through kevent queue */
 
 /*
  * This works because the alignment is ok on all current architectures
@@ -260,6 +261,8 @@ typedef struct sigevent {
 			void (*_function)(sigval_t);
 			void *_attribute;	/* really pthread_attr_t */
 		} _sigev_thread;
+
+		int kevent_fd;
 	} _sigev_un;
 } sigevent_t;
 
diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index a7dd38f..4b9deb4 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -4,6 +4,7 @@
 #include <linux/spinlock.h>
 #include <linux/list.h>
 #include <linux/sched.h>
+#include <linux/kevent_storage.h>
 
 union cpu_time_count {
 	cputime_t cpu;
@@ -49,6 +50,9 @@ struct k_itimer {
 	sigval_t it_sigev_value;	/* value word of sigevent struct */
 	struct task_struct *it_process;	/* process to send signal to */
 	struct sigqueue *sigq;		/* signal queue entry. */
+#ifdef CONFIG_KEVENT_TIMER
+	struct kevent_storage st;
+#endif
 	union {
 		struct {
 			struct hrtimer timer;
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 5fe87de..5ec805e 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -48,6 +48,8 @@
 #include <linux/wait.h>
 #include <linux/workqueue.h>
 #include <linux/module.h>
+#include <linux/kevent.h>
+#include <linux/file.h>
 
 /*
  * Management arrays for POSIX timers.	 Timers are kept in slab memory
@@ -224,6 +226,100 @@ static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp)
 	return 0;
 }
 
+#ifdef CONFIG_KEVENT_TIMER
+static int posix_kevent_enqueue(struct kevent *k)
+{
+	/*
+	 * It is not ugly - there is no pointer in the id field union, 
+	 * but its size is 64bits, which is ok for any known pointer size.
+	 */
+	struct k_itimer *tmr = (struct k_itimer *)(unsigned long)k->event.id.raw_u64;
+	return kevent_storage_enqueue(&tmr->st, k);
+}
+static int posix_kevent_dequeue(struct kevent *k)
+{
+	struct k_itimer *tmr = (struct k_itimer *)(unsigned long)k->event.id.raw_u64;
+	kevent_storage_dequeue(&tmr->st, k);
+	return 0;
+}
+static int posix_kevent_callback(struct kevent *k)
+{
+	return 1;
+}
+static int posix_kevent_init(void)
+{
+	struct kevent_callbacks tc = {
+		.callback = &posix_kevent_callback,
+		.enqueue = &posix_kevent_enqueue,
+		.dequeue = &posix_kevent_dequeue,
+		.flags = KEVENT_CALLBACKS_KERNELONLY};
+
+	return kevent_add_callbacks(&tc, KEVENT_POSIX_TIMER);
+}
+
+extern struct file_operations kevent_user_fops;
+
+static int posix_kevent_init_timer(struct k_itimer *tmr, int fd)
+{
+	struct ukevent uk;
+	struct file *file;
+	struct kevent_user *u;
+	int err;
+
+	file = fget(fd);
+	if (!file) {
+		err = -EBADF;
+		goto err_out;
+	}
+
+	if (file->f_op != &kevent_user_fops) {
+		err = -EINVAL;
+		goto err_out_fput;
+	}
+
+	u = file->private_data;
+
+	memset(&uk, 0, sizeof(struct ukevent));
+
+	uk.event = KEVENT_MASK_ALL;
+	uk.type = KEVENT_POSIX_TIMER;
+	uk.id.raw_u64 = (unsigned long)(tmr); /* Just cast to something unique */
+	uk.req_flags = KEVENT_REQ_ONESHOT | KEVENT_REQ_ALWAYS_QUEUE;
+	uk.ptr = tmr->it_sigev_value.sival_ptr;
+
+	err = kevent_user_add_ukevent(&uk, u);
+	if (err)
+		goto err_out_fput;
+
+	fput(file);
+
+	return 0;
+
+err_out_fput:
+	fput(file);
+err_out:
+	return err;
+}
+
+static void posix_kevent_fini_timer(struct k_itimer *tmr)
+{
+	kevent_storage_fini(&tmr->st);
+}
+#else
+static int posix_kevent_init_timer(struct k_itimer *tmr, int fd)
+{
+	return -ENOSYS;
+}
+static int posix_kevent_init(void)
+{
+	return 0;
+}
+static void posix_kevent_fini_timer(struct k_itimer *tmr)
+{
+}
+#endif
+
+
 /*
  * Initialize everything, well, just everything in Posix clocks/timers ;)
  */
@@ -241,6 +337,11 @@ static __init int init_posix_timers(void)
 	register_posix_clock(CLOCK_REALTIME, &clock_realtime);
 	register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic);
 
+	if (posix_kevent_init()) {
+		printk(KERN_ERR "Failed to initialize kevent posix timers.\n");
+		BUG();
+	}
+
 	posix_timers_cache = kmem_cache_create("posix_timers_cache",
 					sizeof (struct k_itimer), 0, 0, NULL, NULL);
 	idr_init(&posix_timers_id);
@@ -343,23 +444,29 @@ static int posix_timer_fn(struct hrtimer *timer)
 
 	timr = container_of(timer, struct k_itimer, it.real.timer);
 	spin_lock_irqsave(&timr->it_lock, flags);
+	
+	if (timr->it_sigev_notify == SIGEV_KEVENT) {
+#ifdef CONFIG_KEVENT_TIMER
+		kevent_storage_ready(&timr->st, NULL, KEVENT_MASK_ALL);
+#endif
+	} else {
+		if (timr->it.real.interval.tv64 != 0)
+			si_private = ++timr->it_requeue_pending;
 
-	if (timr->it.real.interval.tv64 != 0)
-		si_private = ++timr->it_requeue_pending;
-
-	if (posix_timer_event(timr, si_private)) {
-		/*
-		 * signal was not sent because of sig_ignor
-		 * we will not get a call back to restart it AND
-		 * it should be restarted.
-		 */
-		if (timr->it.real.interval.tv64 != 0) {
-			timr->it_overrun +=
-				hrtimer_forward(timer,
-						timer->base->softirq_time,
-						timr->it.real.interval);
-			ret = HRTIMER_RESTART;
-			++timr->it_requeue_pending;
+		if (posix_timer_event(timr, si_private)) {
+			/*
+			 * signal was not sent because of sig_ignor
+			 * we will not get a call back to restart it AND
+			 * it should be restarted.
+			 */
+			if (timr->it.real.interval.tv64 != 0) {
+				timr->it_overrun +=
+					hrtimer_forward(timer,
+							timer->base->softirq_time,
+							timr->it.real.interval);
+				ret = HRTIMER_RESTART;
+				++timr->it_requeue_pending;
+			}
 		}
 	}
 
@@ -407,6 +514,9 @@ static struct k_itimer * alloc_posix_timer(void)
 		kmem_cache_free(posix_timers_cache, tmr);
 		tmr = NULL;
 	}
+#ifdef CONFIG_KEVENT_TIMER
+	kevent_storage_init(tmr, &tmr->st);
+#endif
 	return tmr;
 }
 
@@ -424,6 +534,7 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
 	if (unlikely(tmr->it_process) &&
 	    tmr->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
 		put_task_struct(tmr->it_process);
+	posix_kevent_fini_timer(tmr);
 	kmem_cache_free(posix_timers_cache, tmr);
 }
 
@@ -496,40 +607,52 @@ sys_timer_create(const clockid_t which_clock,
 		new_timer->it_sigev_signo = event.sigev_signo;
 		new_timer->it_sigev_value = event.sigev_value;
 
-		read_lock(&tasklist_lock);
-		if ((process = good_sigevent(&event))) {
-			/*
-			 * We may be setting up this process for another
-			 * thread.  It may be exiting.  To catch this
-			 * case the we check the PF_EXITING flag.  If
-			 * the flag is not set, the siglock will catch
-			 * him before it is too late (in exit_itimers).
-			 *
-			 * The exec case is a bit more invloved but easy
-			 * to code.  If the process is in our thread
-			 * group (and it must be or we would not allow
-			 * it here) and is doing an exec, it will cause
-			 * us to be killed.  In this case it will wait
-			 * for us to die which means we can finish this
-			 * linkage with our last gasp. I.e. no code :)
-			 */
+		if (event.sigev_notify == SIGEV_KEVENT) {
+			error = posix_kevent_init_timer(new_timer, event._sigev_un.kevent_fd);
+			if (error)
+				goto out;
+
+			process = current->group_leader;
 			spin_lock_irqsave(&process->sighand->siglock, flags);
-			if (!(process->flags & PF_EXITING)) {
-				new_timer->it_process = process;
-				list_add(&new_timer->list,
-					 &process->signal->posix_timers);
-				spin_unlock_irqrestore(&process->sighand->siglock, flags);
-				if (new_timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
-					get_task_struct(process);
-			} else {
-				spin_unlock_irqrestore(&process->sighand->siglock, flags);
-				process = NULL;
+			new_timer->it_process = process;
+			list_add(&new_timer->list, &process->signal->posix_timers);
+			spin_unlock_irqrestore(&process->sighand->siglock, flags);
+		} else {
+			read_lock(&tasklist_lock);
+			if ((process = good_sigevent(&event))) {
+				/*
+				 * We may be setting up this process for another
+				 * thread.  It may be exiting.  To catch this
+				 * case the we check the PF_EXITING flag.  If
+				 * the flag is not set, the siglock will catch
+				 * him before it is too late (in exit_itimers).
+				 *
+				 * The exec case is a bit more invloved but easy
+				 * to code.  If the process is in our thread
+				 * group (and it must be or we would not allow
+				 * it here) and is doing an exec, it will cause
+				 * us to be killed.  In this case it will wait
+				 * for us to die which means we can finish this
+				 * linkage with our last gasp. I.e. no code :)
+				 */
+				spin_lock_irqsave(&process->sighand->siglock, flags);
+				if (!(process->flags & PF_EXITING)) {
+					new_timer->it_process = process;
+					list_add(&new_timer->list,
+						 &process->signal->posix_timers);
+					spin_unlock_irqrestore(&process->sighand->siglock, flags);
+					if (new_timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
+						get_task_struct(process);
+				} else {
+					spin_unlock_irqrestore(&process->sighand->siglock, flags);
+					process = NULL;
+				}
+			}
+			read_unlock(&tasklist_lock);
+			if (!process) {
+				error = -EINVAL;
+				goto out;
 			}
-		}
-		read_unlock(&tasklist_lock);
-		if (!process) {
-			error = -EINVAL;
-			goto out;
 		}
 	} else {
 		new_timer->it_sigev_notify = SIGEV_SIGNAL;


^ permalink raw reply	[flat|nested] 76+ messages in thread

end of thread, other threads:[~2007-01-08 19:30 UTC | newest]

Thread overview: 76+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <3154985aa0591036@2ka.mipt.ru>
2006-12-17 13:53 ` [take28-resend_2->0 0/8] kevent: Generic event handling mechanism Evgeniy Polyakov
2006-12-17 13:53   ` [take28-resend_2->0 1/8] kevent: Description Evgeniy Polyakov
2006-12-17 13:53     ` [take28-resend_2->0 2/8] kevent: Core files Evgeniy Polyakov
2006-12-17 13:53       ` [take28-resend_2->0 3/8] kevent: poll/select() notifications Evgeniy Polyakov
2006-12-17 13:53         ` [take28-resend_2->0 4/8] kevent: Socket notifications Evgeniy Polyakov
2006-12-17 13:53           ` [take28-resend_2->0 5/8] kevent: Timer notifications Evgeniy Polyakov
2006-12-17 13:53             ` [take28-resend_2->0 6/8] kevent: Pipe notifications Evgeniy Polyakov
2006-12-17 13:53               ` [take28-resend_2->0 7/8] kevent: Signal notifications Evgeniy Polyakov
2006-12-17 13:53                 ` [take28-resend_2->0 8/8] kevent: Kevent posix timer notifications Evgeniy Polyakov
2006-12-19  3:47   ` [take28-resend_2->0 0/8] kevent: Generic event handling mechanism Ulrich Drepper
2006-12-19  4:01     ` Randy Dunlap
2006-12-19  4:51     ` Evgeniy Polyakov
2006-12-19  6:21       ` Ulrich Drepper
2006-12-19  6:38         ` Evgeniy Polyakov
2006-12-19  8:01           ` Ulrich Drepper
2006-12-19  8:56             ` Evgeniy Polyakov
2006-12-21  9:14 ` [take28-resend_1->0 " Evgeniy Polyakov
2006-12-21  9:14   ` [take28-resend_1->0 1/8] kevent: Description Evgeniy Polyakov
2006-12-21  9:14     ` [take28-resend_1->0 2/8] kevent: Core files Evgeniy Polyakov
2006-12-21  9:14       ` [take28-resend_1->0 3/8] kevent: poll/select() notifications Evgeniy Polyakov
2006-12-21  9:14         ` [take28-resend_1->0 4/8] kevent: Socket notifications Evgeniy Polyakov
2006-12-21  9:14           ` [take28-resend_1->0 5/8] kevent: Timer notifications Evgeniy Polyakov
2006-12-21  9:14             ` [take28-resend_1->0 6/8] kevent: Pipe notifications Evgeniy Polyakov
2006-12-21  9:14               ` [take28-resend_1->0 7/8] kevent: Signal notifications Evgeniy Polyakov
2006-12-21  9:14                 ` [take28-resend_1->0 8/8] kevent: Kevent posix timer notifications Evgeniy Polyakov
2006-12-21 10:35   ` [take28-resend_1->0 0/8] kevent: Generic event handling mechanism Evgeniy Polyakov
2006-12-21 10:41     ` Jeff Garzik
2006-12-21 10:49       ` Evgeniy Polyakov
2006-12-21 10:57         ` Evgeniy Polyakov
2006-12-21 13:48         ` jamal
2006-12-21 14:04           ` Evgeniy Polyakov
2006-12-21 14:21             ` jamal
2006-12-21 14:23               ` Evgeniy Polyakov
2006-12-21 14:36                 ` Evgeniy Polyakov
2006-12-21 14:40                   ` jamal
2006-12-21 14:46                     ` Evgeniy Polyakov
2006-12-21 16:42                       ` jamal
2006-12-21 16:51                         ` Evgeniy Polyakov
2006-12-23 16:51 ` [take29 " Evgeniy Polyakov
2006-12-23 16:51   ` [take29 1/8] kevent: Description Evgeniy Polyakov
2006-12-23 16:51     ` [take29 2/8] kevent: Core files Evgeniy Polyakov
2006-12-23 16:51       ` [take29 3/8] kevent: poll/select() notifications Evgeniy Polyakov
2006-12-23 16:51         ` [take29 4/8] kevent: Socket notifications Evgeniy Polyakov
2006-12-23 16:51           ` [take29 5/8] kevent: Timer notifications Evgeniy Polyakov
2006-12-23 16:51             ` [take29 6/8] kevent: Pipe notifications Evgeniy Polyakov
2006-12-23 16:51               ` [take29 7/8] kevent: Signal notifications Evgeniy Polyakov
2006-12-23 16:51                 ` [take29 8/8] kevent: Kevent posix timer notifications Evgeniy Polyakov
2006-12-23 17:10   ` [take29 0/8] kevent: Generic event handling mechanism Evgeniy Polyakov
2006-12-28 15:56   ` Ingo Molnar
2006-12-29  8:48     ` Evgeniy Polyakov
2006-12-28 16:01   ` Ingo Molnar
2006-12-29  8:55     ` Evgeniy Polyakov
2006-12-29 12:54       ` Ingo Molnar
2006-12-29 13:14         ` Evgeniy Polyakov
2006-12-29 13:24           ` Ingo Molnar
2006-12-29 12:25 ` [take30 0/9] " Evgeniy Polyakov
2006-12-29 12:25   ` [take30 1/9] kevent: Description Evgeniy Polyakov
2006-12-29 12:25     ` [take30 2/9] kevent: Core files Evgeniy Polyakov
2006-12-29 12:25       ` [take30 3/9] kevent: poll/select() notifications Evgeniy Polyakov
2006-12-29 12:25         ` [take30 4/9] kevent: Socket notifications Evgeniy Polyakov
2006-12-29 12:25           ` [take30 5/9] kevent: Timer notifications Evgeniy Polyakov
2006-12-29 12:25             ` [take30 6/9] kevent: Pipe notifications Evgeniy Polyakov
2006-12-29 12:25               ` [take30 7/9] kevent: Signal notifications Evgeniy Polyakov
2006-12-29 12:25                 ` [take30 8/9] kevent: Kevent posix timer notifications Evgeniy Polyakov
2006-12-29 12:25                   ` [take30 9/9] kevent: Private userspace notifications Evgeniy Polyakov
2007-01-08 19:25 ` [take31 0/10] kevent: Generic event handling mechanism Evgeniy Polyakov
2007-01-08 19:25   ` [take31 1/10] kevent: Description Evgeniy Polyakov
2007-01-08 19:25     ` [take31 2/10] kevent: Core files Evgeniy Polyakov
2007-01-08 19:25       ` [take31 3/10] kevent: poll/select() notifications Evgeniy Polyakov
2007-01-08 19:25         ` [take31 4/10] kevent: Socket notifications Evgeniy Polyakov
2007-01-08 19:25           ` [take31 5/10] kevent: Timer notifications Evgeniy Polyakov
2007-01-08 19:25             ` [take31 6/10] kevent: Pipe notifications Evgeniy Polyakov
2007-01-08 19:25               ` [take31 7/10] kevent: Signal notifications Evgeniy Polyakov
2007-01-08 19:26                 ` [take31 8/10] kevent: Kevent posix timer notifications Evgeniy Polyakov
2007-01-08 19:26                   ` [take31 9/10] kevent: Private userspace notifications Evgeniy Polyakov
2007-01-08 19:26                     ` [take31 10/10] kevent: Kevent based AIO (aio_sendfile()) Evgeniy Polyakov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).