/*
 * Implement CPU time clocks for the POSIX clock interface.
 */

#include <linux/sched.h>
#include <linux/posix-timers.h>
#include <linux/errno.h>
#include <linux/math64.h>
#include <asm/uaccess.h>
#include <linux/kernel_stat.h>
#include <trace/events/timer.h>

/*
 * Called after updating RLIMIT_CPU to run cpu timer and update
 * tsk->signal->cputime_expires expiration cache if necessary. Needs
 * siglock protection since other code may update expiration cache as
 * well.
 */
void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new)
{
	cputime_t cputime = secs_to_cputime(rlim_new);

	spin_lock_irq(&task->sighand->siglock);
	set_process_cpu_timer(task, CPUCLOCK_PROF, &cputime, NULL);
	spin_unlock_irq(&task->sighand->siglock);
}

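/*
 * Validate a CPU-clock clockid_t: the clock type must be one of
 * CPUCLOCK_PROF, CPUCLOCK_VIRT or CPUCLOCK_SCHED, and any PID encoded in
 * the clockid must name a task the caller may sample (a thread in our own
 * group for per-thread clocks, a group leader for process clocks).
 */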
static int check_clock(const clockid_t which_clock)
{
	int error = 0;
	struct task_struct *p;
	const pid_t pid = CPUCLOCK_PID(which_clock);

	if (CPUCLOCK_WHICH(which_clock) >= CPUCLOCK_MAX)
		return -EINVAL;

	if (pid == 0)
		return 0;

	read_lock(&tasklist_lock);
	p = find_task_by_vpid(pid);
	if (!p || !(CPUCLOCK_PERTHREAD(which_clock) ?
		   same_thread_group(p, current) : thread_group_leader(p))) {
		error = -EINVAL;
	}
	read_unlock(&tasklist_lock);

	return error;
}

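/*
 * Convert a timespec into the union cpu_time_count representation used
 * internally: nanoseconds in .sched for CPUCLOCK_SCHED, a cputime_t in
 * .cpu for the tick-based clocks.
 */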
static inline union cpu_time_count
timespec_to_sample(const clockid_t which_clock, const struct timespec *tp)
{
	union cpu_time_count ret;
	ret.sched = 0;		/* high half always zero when .cpu used */
	if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
		ret.sched = (unsigned long long)tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec;
	} else {
		ret.cpu = timespec_to_cputime(tp);
	}
	return ret;
}

static void sample_to_timespec(const clockid_t which_clock,
			       union cpu_time_count cpu,
			       struct timespec *tp)
{
	if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED)
		*tp = ns_to_timespec(cpu.sched);
	else
		cputime_to_timespec(cpu.cpu, tp);
}

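/*
 * Helpers to compare, add and subtract cpu_time_count samples in the
 * representation appropriate for the given clock.
 */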
static inline int cpu_time_before(const clockid_t which_clock,
				  union cpu_time_count now,
				  union cpu_time_count then)
{
	if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
		return now.sched < then.sched;
	} else {
		return cputime_lt(now.cpu, then.cpu);
	}
}
static inline void cpu_time_add(const clockid_t which_clock,
				union cpu_time_count *acc,
				union cpu_time_count val)
{
	if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
		acc->sched += val.sched;
	} else {
		acc->cpu = cputime_add(acc->cpu, val.cpu);
	}
}
static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock,
						union cpu_time_count a,
						union cpu_time_count b)
{
	if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
		a.sched -= b.sched;
	} else {
		a.cpu = cputime_sub(a.cpu, b.cpu);
	}
	return a;
}

/*
 * Divide and limit the result to res >= 1
 *
 * This is necessary to prevent signal delivery starvation, when the result of
 * the division would be rounded down to 0.
 */
static inline cputime_t cputime_div_non_zero(cputime_t time, unsigned long div)
{
	cputime_t res = cputime_div(time, div);

	return max_t(cputime_t, res, 1);
}

/*
 * Update expiry time from increment, and increase overrun count,
 * given the current clock sample.
 */
static void bump_cpu_timer(struct k_itimer *timer,
			   union cpu_time_count now)
{
	int i;

	if (timer->it.cpu.incr.sched == 0)
		return;

	if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) {
		unsigned long long delta, incr;

		if (now.sched < timer->it.cpu.expires.sched)
			return;
		incr = timer->it.cpu.incr.sched;
		delta = now.sched + incr - timer->it.cpu.expires.sched;
		/* Don't use (incr*2 < delta), incr*2 might overflow. */
		for (i = 0; incr < delta - incr; i++)
			incr = incr << 1;
		for (; i >= 0; incr >>= 1, i--) {
			if (delta < incr)
				continue;
			timer->it.cpu.expires.sched += incr;
			timer->it_overrun += 1 << i;
			delta -= incr;
		}
	} else {
		cputime_t delta, incr;

		if (cputime_lt(now.cpu, timer->it.cpu.expires.cpu))
			return;
		incr = timer->it.cpu.incr.cpu;
		delta = cputime_sub(cputime_add(now.cpu, incr),
				    timer->it.cpu.expires.cpu);
		/* Don't use (incr*2 < delta), incr*2 might overflow. */
		for (i = 0; cputime_lt(incr, cputime_sub(delta, incr)); i++)
			incr = cputime_add(incr, incr);
		for (; i >= 0; incr = cputime_halve(incr), i--) {
			if (cputime_lt(delta, incr))
				continue;
			timer->it.cpu.expires.cpu =
				cputime_add(timer->it.cpu.expires.cpu, incr);
			timer->it_overrun += 1 << i;
			delta = cputime_sub(delta, incr);
		}
	}
}

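/*
 * Per-thread samples: profiling time is user + system time, virtual time
 * is user time only.
 */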
static inline cputime_t prof_ticks(struct task_struct *p)
{
	return cputime_add(p->utime, p->stime);
}
static inline cputime_t virt_ticks(struct task_struct *p)
{
	return p->utime;
}

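/*
 * Report the resolution of a CPU clock: one tick (1s/HZ, rounded up to
 * nanoseconds) for the tick-based clocks, a nominal 1ns for
 * CPUCLOCK_SCHED.
 */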
int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
{
	int error = check_clock(which_clock);
	if (!error) {
		tp->tv_sec = 0;
		tp->tv_nsec = ((NSEC_PER_SEC + HZ - 1) / HZ);
		if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
			/*
			 * If sched_clock is using a cycle counter, we
			 * have no idea of its true resolution, but it
			 * is much finer than 1s/HZ.
			 */
			tp->tv_nsec = 1;
		}
	}
	return error;
}

int posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp)
{
	/*
	 * You can never reset a CPU clock, but we check for other errors
	 * in the call before failing with EPERM.
	 */
	int error = check_clock(which_clock);
	if (error == 0) {
		error = -EPERM;
	}
	return error;
}


/*
 * Sample a per-thread clock for the given task.
 */
static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
			    union cpu_time_count *cpu)
{
	switch (CPUCLOCK_WHICH(which_clock)) {
	default:
		return -EINVAL;
	case CPUCLOCK_PROF:
		cpu->cpu = prof_ticks(p);
		break;
	case CPUCLOCK_VIRT:
		cpu->cpu = virt_ticks(p);
		break;
	case CPUCLOCK_SCHED:
		cpu->sched = task_sched_runtime(p);
		break;
	}
	return 0;
}

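/*
 * Accumulate the CPU time of every live thread in the group on top of the
 * time already banked in the signal_struct by threads that have exited.
 */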
void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
{
	struct signal_struct *sig = tsk->signal;
	struct task_struct *t;

	times->utime = sig->utime;
	times->stime = sig->stime;
	times->sum_exec_runtime = sig->sum_sched_runtime;

	rcu_read_lock();
	/* make sure we can trust tsk->thread_group list */
	if (!likely(pid_alive(tsk)))
		goto out;

	t = tsk;
	do {
		times->utime = cputime_add(times->utime, t->utime);
		times->stime = cputime_add(times->stime, t->stime);
		times->sum_exec_runtime += t->se.sum_exec_runtime;
	} while_each_thread(tsk, t);
out:
	rcu_read_unlock();
}

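/*
 * Merge a fresh sample into the group cputimer cache, never letting any
 * of the cached values go backwards.
 */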
static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b)
{
	if (cputime_gt(b->utime, a->utime))
		a->utime = b->utime;

	if (cputime_gt(b->stime, a->stime))
		a->stime = b->stime;

	if (b->sum_exec_runtime > a->sum_exec_runtime)
		a->sum_exec_runtime = b->sum_exec_runtime;
}

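/*
 * Return the cached group CPU times, starting the per-group cputimer (and
 * synchronizing its cache with a full sample) if it was not already
 * running.
 */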
void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
{
	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
	struct task_cputime sum;
	unsigned long flags;

	spin_lock_irqsave(&cputimer->lock, flags);
	if (!cputimer->running) {
		cputimer->running = 1;
		/*
		 * The POSIX timer interface allows for absolute time expiry
		 * values through the TIMER_ABSTIME flag, therefore we have
		 * to synchronize the timer to the clock every time we start
		 * it.
		 */
		thread_group_cputime(tsk, &sum);
		update_gt_cputime(&cputimer->cputime, &sum);
	}
	*times = cputimer->cputime;
	spin_unlock_irqrestore(&cputimer->lock, flags);
}

/*
 * Sample a process (thread group) clock for the given group_leader task.
 * Must be called with tasklist_lock held for reading.
 */
static int cpu_clock_sample_group(const clockid_t which_clock,
				  struct task_struct *p,
				  union cpu_time_count *cpu)
{
	struct task_cputime cputime;

	switch (CPUCLOCK_WHICH(which_clock)) {
	default:
		return -EINVAL;
	case CPUCLOCK_PROF:
		thread_group_cputime(p, &cputime);
		cpu->cpu = cputime_add(cputime.utime, cputime.stime);
		break;
	case CPUCLOCK_VIRT:
		thread_group_cputime(p, &cputime);
		cpu->cpu = cputime.utime;
		break;
	case CPUCLOCK_SCHED:
		cpu->sched = thread_group_sched_runtime(p);
		break;
	}
	return 0;
}


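/*
 * Read a CPU clock.  Our own clocks (pid == 0) can be sampled without a
 * task lookup; for another task's clock we find the task and check that
 * the caller may sample it.  For example, a clockid built with
 * MAKE_PROCESS_CPUCLOCK(pid, CPUCLOCK_PROF) reads that process's total
 * user + system time.
 */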
int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
{
	const pid_t pid = CPUCLOCK_PID(which_clock);
	int error = -EINVAL;
	union cpu_time_count rtn;

	if (pid == 0) {
		/*
		 * Special case constant value for our own clocks.
		 * We don't have to do any lookup to find ourselves.
		 */
		if (CPUCLOCK_PERTHREAD(which_clock)) {
			/*
			 * Sampling just ourselves we can do with no locking.
			 */
			error = cpu_clock_sample(which_clock,
						 current, &rtn);
		} else {
			read_lock(&tasklist_lock);
			error = cpu_clock_sample_group(which_clock,
						       current, &rtn);
			read_unlock(&tasklist_lock);
		}
	} else {
		/*
		 * Find the given PID, and validate that the caller
		 * should be able to see it.
		 */
		struct task_struct *p;
		rcu_read_lock();
		p = find_task_by_vpid(pid);
		if (p) {
			if (CPUCLOCK_PERTHREAD(which_clock)) {
				if (same_thread_group(p, current)) {
					error = cpu_clock_sample(which_clock,
								 p, &rtn);
				}
			} else {
				read_lock(&tasklist_lock);
				if (thread_group_leader(p) && p->sighand) {
					error =
					    cpu_clock_sample_group(which_clock,
								   p, &rtn);
				}
				read_unlock(&tasklist_lock);
			}
		}
		rcu_read_unlock();
	}

	if (error)
		return error;
	sample_to_timespec(which_clock, rtn, tp);
	return 0;
}


/*
 * Validate the clockid_t for a new CPU-clock timer, and initialize the timer.
 * This is called from sys_timer_create() and do_cpu_nanosleep() with the
 * new timer already all-zeros initialized.
 */
int posix_cpu_timer_create(struct k_itimer *new_timer)
{
	int ret = 0;
	const pid_t pid = CPUCLOCK_PID(new_timer->it_clock);
	struct task_struct *p;

	if (CPUCLOCK_WHICH(new_timer->it_clock) >= CPUCLOCK_MAX)
		return -EINVAL;

	INIT_LIST_HEAD(&new_timer->it.cpu.entry);

	read_lock(&tasklist_lock);
	if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) {
		if (pid == 0) {
			p = current;
		} else {
			p = find_task_by_vpid(pid);
			if (p && !same_thread_group(p, current))
				p = NULL;
		}
	} else {
		if (pid == 0) {
			p = current->group_leader;
		} else {
			p = find_task_by_vpid(pid);
			if (p && !thread_group_leader(p))
				p = NULL;
		}
	}
	new_timer->it.cpu.task = p;
	if (p) {
		get_task_struct(p);
	} else {
		ret = -EINVAL;
	}
	read_unlock(&tasklist_lock);

	return ret;
}

/*
 * Clean up a CPU-clock timer that is about to be destroyed.
 * This is called from timer deletion with the timer already locked.
 * If we return TIMER_RETRY, it's necessary to release the timer's lock
 * and try again.  (This happens when the timer is in the middle of firing.)
 */
int posix_cpu_timer_del(struct k_itimer *timer)
{
	struct task_struct *p = timer->it.cpu.task;
	int ret = 0;

	if (likely(p != NULL)) {
		read_lock(&tasklist_lock);
		if (unlikely(p->sighand == NULL)) {
			/*
			 * We raced with the reaping of the task.
			 * The deletion should have cleared us off the list.
			 */
			BUG_ON(!list_empty(&timer->it.cpu.entry));
		} else {
			spin_lock(&p->sighand->siglock);
			if (timer->it.cpu.firing)
				ret = TIMER_RETRY;
			else
				list_del(&timer->it.cpu.entry);
			spin_unlock(&p->sighand->siglock);
		}
		read_unlock(&tasklist_lock);

		if (!ret)
			put_task_struct(p);
	}

	return ret;
}

/*
 * Clean out CPU timers still ticking when a thread exited.  The task
 * pointer is cleared, and the expiry time is replaced with the residual
 * time for later timer_gettime calls to return.
 * This must be called with the siglock held.
 */
static void cleanup_timers(struct list_head *head,
			   cputime_t utime, cputime_t stime,
			   unsigned long long sum_exec_runtime)
{
	struct cpu_timer_list *timer, *next;
	cputime_t ptime = cputime_add(utime, stime);

	list_for_each_entry_safe(timer, next, head, entry) {
		list_del_init(&timer->entry);
		if (cputime_lt(timer->expires.cpu, ptime)) {
			timer->expires.cpu = cputime_zero;
		} else {
			timer->expires.cpu = cputime_sub(timer->expires.cpu,
							 ptime);
		}
	}

	++head;
	list_for_each_entry_safe(timer, next, head, entry) {
		list_del_init(&timer->entry);
		if (cputime_lt(timer->expires.cpu, utime)) {
			timer->expires.cpu = cputime_zero;
		} else {
			timer->expires.cpu = cputime_sub(timer->expires.cpu,
							 utime);
		}
	}

	++head;
	list_for_each_entry_safe(timer, next, head, entry) {
		list_del_init(&timer->entry);
		if (timer->expires.sched < sum_exec_runtime) {
			timer->expires.sched = 0;
		} else {
			timer->expires.sched -= sum_exec_runtime;
		}
	}
}

/*
 * These are both called with the siglock held, when the current thread
 * is being reaped.  When the final (leader) thread in the group is reaped,
 * posix_cpu_timers_exit_group will be called after posix_cpu_timers_exit.
 */
void posix_cpu_timers_exit(struct task_struct *tsk)
{
	cleanup_timers(tsk->cpu_timers,
		       tsk->utime, tsk->stime, tsk->se.sum_exec_runtime);

}
void posix_cpu_timers_exit_group(struct task_struct *tsk)
{
	struct signal_struct *const sig = tsk->signal;

	cleanup_timers(tsk->signal->cpu_timers,
		       cputime_add(tsk->utime, sig->utime),
		       cputime_add(tsk->stime, sig->stime),
		       tsk->se.sum_exec_runtime + sig->sum_sched_runtime);
}

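/*
 * The timer's task is dead: drop our reference to it and convert the
 * absolute expiry time into the residual value that timer_gettime should
 * keep reporting.
 */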
static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
{
	/*
	 * That's all for this thread or process.
	 * We leave our residual in expires to be reported.
	 */
	put_task_struct(timer->it.cpu.task);
	timer->it.cpu.task = NULL;
	timer->it.cpu.expires = cpu_time_sub(timer->it_clock,
					     timer->it.cpu.expires,
					     now);
}

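/*
 * A zero value in the expiration cache means no timer is armed, so any
 * new expiry time counts as earlier.
 */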
static inline int expires_gt(cputime_t expires, cputime_t new_exp)
{
	return cputime_eq(expires, cputime_zero) ||
	       cputime_gt(expires, new_exp);
}

/*
 * Insert the timer on the appropriate list before any timers that
 * expire later.  This must be called with the tasklist_lock held
 * for reading, interrupts disabled and p->sighand->siglock taken.
 */
static void arm_timer(struct k_itimer *timer)
{
	struct task_struct *p = timer->it.cpu.task;
	struct list_head *head, *listpos;
	struct task_cputime *cputime_expires;
	struct cpu_timer_list *const nt = &timer->it.cpu;
	struct cpu_timer_list *next;

	if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
		head = p->cpu_timers;
		cputime_expires = &p->cputime_expires;
	} else {
		head = p->signal->cpu_timers;
		cputime_expires = &p->signal->cputime_expires;
	}
	head += CPUCLOCK_WHICH(timer->it_clock);

	listpos = head;
	list_for_each_entry(next, head, entry) {
		if (cpu_time_before(timer->it_clock, nt->expires, next->expires))
			break;
		listpos = &next->entry;
	}
	list_add(&nt->entry, listpos);

	if (listpos == head) {
		union cpu_time_count *exp = &nt->expires;

		/*
		 * We are the new earliest-expiring POSIX 1.b timer, hence
		 * need to update expiration cache. Take into account that
		 * for process timers we share expiration cache with itimers
		 * and RLIMIT_CPU and for thread timers with RLIMIT_RTTIME.
		 */

		switch (CPUCLOCK_WHICH(timer->it_clock)) {
		case CPUCLOCK_PROF:
			if (expires_gt(cputime_expires->prof_exp, exp->cpu))
				cputime_expires->prof_exp = exp->cpu;
			break;
		case CPUCLOCK_VIRT:
			if (expires_gt(cputime_expires->virt_exp, exp->cpu))
				cputime_expires->virt_exp = exp->cpu;
			break;
		case CPUCLOCK_SCHED:
			if (cputime_expires->sched_exp == 0 ||
			    cputime_expires->sched_exp > exp->sched)
				cputime_expires->sched_exp = exp->sched;
			break;
		}
	}
}

/*
 * The timer is locked, fire it and arrange for its reload.
 */
static void cpu_timer_fire(struct k_itimer *timer)
{
	if ((timer->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) {
		/*
		 * The user doesn't want any signal.
		 */
		timer->it.cpu.expires.sched = 0;
	} else if (unlikely(timer->sigq == NULL)) {
		/*
		 * This is a special case for clock_nanosleep,
		 * not a normal timer from sys_timer_create.
		 */
		wake_up_process(timer->it_process);
		timer->it.cpu.expires.sched = 0;
	} else if (timer->it.cpu.incr.sched == 0) {
		/*
		 * One-shot timer.  Clear it as soon as it's fired.
		 */
		posix_timer_event(timer, 0);
		timer->it.cpu.expires.sched = 0;
	} else if (posix_timer_event(timer, ++timer->it_requeue_pending)) {
		/*
		 * The signal did not get queued because the signal
		 * was ignored, so we won't get any callback to
		 * reload the timer.  But we need to keep it
		 * ticking in case the signal is deliverable next time.
		 */
		posix_cpu_timer_schedule(timer);
	}
}

/*
 * Sample a process (thread group) timer for the given group_leader task.
 * Must be called with tasklist_lock held for reading.
 */
static int cpu_timer_sample_group(const clockid_t which_clock,
				  struct task_struct *p,
				  union cpu_time_count *cpu)
{
	struct task_cputime cputime;

	thread_group_cputimer(p, &cputime);
	switch (CPUCLOCK_WHICH(which_clock)) {
	default:
		return -EINVAL;
	case CPUCLOCK_PROF:
		cpu->cpu = cputime_add(cputime.utime, cputime.stime);
		break;
	case CPUCLOCK_VIRT:
		cpu->cpu = cputime.utime;
		break;
	case CPUCLOCK_SCHED:
		cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p);
		break;
	}
	return 0;
}

/*
 * Guts of sys_timer_settime for CPU timers.
 * This is called with the timer locked and interrupts disabled.
 * If we return TIMER_RETRY, it's necessary to release the timer's lock
 * and try again.  (This happens when the timer is in the middle of firing.)
 */
int posix_cpu_timer_set(struct k_itimer *timer, int flags,
			struct itimerspec *new, struct itimerspec *old)
{
	struct task_struct *p = timer->it.cpu.task;
	union cpu_time_count old_expires, new_expires, old_incr, val;
	int ret;

	if (unlikely(p == NULL)) {
		/*
		 * Timer refers to a dead task's clock.
		 */
		return -ESRCH;
	}

	new_expires = timespec_to_sample(timer->it_clock, &new->it_value);

	read_lock(&tasklist_lock);
	/*
	 * We need the tasklist_lock to protect against reaping that
	 * clears p->sighand.  If p has just been reaped, we can no
	 * longer get any information about it at all.
	 */
	if (unlikely(p->sighand == NULL)) {
		read_unlock(&tasklist_lock);
		put_task_struct(p);
		timer->it.cpu.task = NULL;
		return -ESRCH;
	}

	/*
	 * Disarm any old timer after extracting its expiry time.
	 */
	BUG_ON(!irqs_disabled());

	ret = 0;
	old_incr = timer->it.cpu.incr;
	spin_lock(&p->sighand->siglock);
	old_expires = timer->it.cpu.expires;
	if (unlikely(timer->it.cpu.firing)) {
		timer->it.cpu.firing = -1;
		ret = TIMER_RETRY;
	} else
		list_del_init(&timer->it.cpu.entry);

	/*
	 * We need to sample the current time to convert the new
	 * value from relative to absolute, and to convert the
	 * old value from absolute to relative.  To set a process
	 * timer, we need a sample to balance the thread expiry
	 * times (in arm_timer).  With an absolute time, we must
	 * check if it's already passed.  In short, we need a sample.
	 */
	if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
		cpu_clock_sample(timer->it_clock, p, &val);
	} else {
		cpu_timer_sample_group(timer->it_clock, p, &val);
	}

	if (old) {
		if (old_expires.sched == 0) {
			old->it_value.tv_sec = 0;
			old->it_value.tv_nsec = 0;
		} else {
			/*
			 * Update the timer in case it has
			 * overrun already.  If it has,
			 * we'll report it as having overrun
			 * and with the next reloaded timer
			 * already ticking, though we are
			 * swallowing that pending
			 * notification here to install the
			 * new setting.
			 */
			bump_cpu_timer(timer, val);
			if (cpu_time_before(timer->it_clock, val,
					    timer->it.cpu.expires)) {
				old_expires = cpu_time_sub(
					timer->it_clock,
					timer->it.cpu.expires, val);
				sample_to_timespec(timer->it_clock,
						   old_expires,
						   &old->it_value);
			} else {
				old->it_value.tv_nsec = 1;
				old->it_value.tv_sec = 0;
			}
		}
	}

	if (unlikely(ret)) {
		/*
		 * We are colliding with the timer actually firing.
		 * Punt after filling in the timer's old value, and
		 * disable this firing since we are already reporting
		 * it as an overrun (thanks to bump_cpu_timer above).
		 */
		spin_unlock(&p->sighand->siglock);
		read_unlock(&tasklist_lock);
		goto out;
	}

	if (new_expires.sched != 0 && !(flags & TIMER_ABSTIME)) {
		cpu_time_add(timer->it_clock, &new_expires, val);
	}

	/*
	 * Install the new expiry time (or zero).
	 * For a timer with no notification action, we don't actually
	 * arm the timer (we'll just fake it for timer_gettime).
	 */
	timer->it.cpu.expires = new_expires;
	if (new_expires.sched != 0 &&
	    cpu_time_before(timer->it_clock, val, new_expires)) {
		arm_timer(timer);
	}

	spin_unlock(&p->sighand->siglock);
	read_unlock(&tasklist_lock);

	/*
	 * Install the new reload setting, and
	 * set up the signal and overrun bookkeeping.
	 */
	timer->it.cpu.incr = timespec_to_sample(timer->it_clock,
						&new->it_interval);

	/*
	 * This acts as a modification timestamp for the timer,
	 * so any automatic reload attempt will punt on seeing
	 * that we have reset the timer manually.
	 */
	timer->it_requeue_pending = (timer->it_requeue_pending + 2) &
		~REQUEUE_PENDING;
	timer->it_overrun_last = 0;
	timer->it_overrun = -1;

	if (new_expires.sched != 0 &&
	    !cpu_time_before(timer->it_clock, val, new_expires)) {
		/*
		 * The designated time already passed, so we notify
		 * immediately, even if the thread never runs to
		 * accumulate more time on this clock.
		 */
		cpu_timer_fire(timer);
	}

	ret = 0;
 out:
	if (old) {
		sample_to_timespec(timer->it_clock,
				   old_incr, &old->it_interval);
	}
	return ret;
}

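/*
 * Guts of sys_timer_gettime for CPU timers: report the reload interval
 * and the time remaining until expiry, sampling the clock to compute the
 * latter.
 */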
void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
{
	union cpu_time_count now;
	struct task_struct *p = timer->it.cpu.task;
	int clear_dead;

	/*
	 * Easy part: convert the reload time.
	 */
	sample_to_timespec(timer->it_clock,
			   timer->it.cpu.incr, &itp->it_interval);

	if (timer->it.cpu.expires.sched == 0) {	/* Timer not armed at all.  */
		itp->it_value.tv_sec = itp->it_value.tv_nsec = 0;
		return;
	}

	if (unlikely(p == NULL)) {
		/*
		 * This task already died and the timer will never fire.
		 * In this case, expires is actually the dead value.
		 */
	dead:
		sample_to_timespec(timer->it_clock, timer->it.cpu.expires,
				   &itp->it_value);
		return;
	}

	/*
	 * Sample the clock to take the difference with the expiry time.
	 */
	if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
		cpu_clock_sample(timer->it_clock, p, &now);
		clear_dead = p->exit_state;
	} else {
		read_lock(&tasklist_lock);
		if (unlikely(p->sighand == NULL)) {
			/*
			 * The process has been reaped.
			 * We can't even collect a sample any more.
			 * Call the timer disarmed, nothing else to do.
			 */
			put_task_struct(p);
			timer->it.cpu.task = NULL;
			timer->it.cpu.expires.sched = 0;
			read_unlock(&tasklist_lock);
			goto dead;
		} else {
			cpu_timer_sample_group(timer->it_clock, p, &now);
			clear_dead = (unlikely(p->exit_state) &&
				      thread_group_empty(p));
		}
		read_unlock(&tasklist_lock);
	}

	if (unlikely(clear_dead)) {
		/*
		 * We've noticed that the thread is dead, but
		 * not yet reaped.  Take this opportunity to
		 * drop our task ref.
		 */
		clear_dead_task(timer, now);
		goto dead;
	}

	if (cpu_time_before(timer->it_clock, now, timer->it.cpu.expires)) {
		sample_to_timespec(timer->it_clock,
				   cpu_time_sub(timer->it_clock,
						timer->it.cpu.expires, now),
				   &itp->it_value);
	} else {
		/*
		 * The timer should have expired already, but the firing
		 * hasn't taken place yet.  Say it's just about to expire.
		 */
		itp->it_value.tv_nsec = 1;
		itp->it_value.tv_sec = 0;
	}
}

/*
 * Check for any per-thread CPU timers that have fired and move them off
 * the tsk->cpu_timers[N] list onto the firing list.  Here we update the
 * tsk->cputime_expires values to reflect the remaining thread CPU timers.
 */
static void check_thread_timers(struct task_struct *tsk,
				struct list_head *firing)
{
	int maxfire;
	struct list_head *timers = tsk->cpu_timers;
	struct signal_struct *const sig = tsk->signal;
	unsigned long soft;

	maxfire = 20;
	tsk->cputime_expires.prof_exp = cputime_zero;
	while (!list_empty(timers)) {
		struct cpu_timer_list *t = list_first_entry(timers,
						      struct cpu_timer_list,
						      entry);
		if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) {
			tsk->cputime_expires.prof_exp = t->expires.cpu;
			break;
		}
		t->firing = 1;
		list_move_tail(&t->entry, firing);
	}

	++timers;
	maxfire = 20;
	tsk->cputime_expires.virt_exp = cputime_zero;
	while (!list_empty(timers)) {
		struct cpu_timer_list *t = list_first_entry(timers,
						      struct cpu_timer_list,
						      entry);
		if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) {
			tsk->cputime_expires.virt_exp = t->expires.cpu;
			break;
		}
		t->firing = 1;
		list_move_tail(&t->entry, firing);
	}

	++timers;
	maxfire = 20;
	tsk->cputime_expires.sched_exp = 0;
	while (!list_empty(timers)) {
		struct cpu_timer_list *t = list_first_entry(timers,
						      struct cpu_timer_list,
						      entry);
		if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) {
			tsk->cputime_expires.sched_exp = t->expires.sched;
			break;
		}
		t->firing = 1;
		list_move_tail(&t->entry, firing);
	}

	/*
	 * Check for the special case thread timers.
	 */
	soft = ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur);
	if (soft != RLIM_INFINITY) {
		unsigned long hard =
			ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);

		if (hard != RLIM_INFINITY &&
		    tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
			/*
			 * At the hard limit, we just die.
			 * No need to calculate anything else now.
			 */
			__group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
			return;
		}
		if (tsk->rt.timeout > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) {
			/*
			 * At the soft limit, send a SIGXCPU every second.
			 */
			if (soft < hard) {
				soft += USEC_PER_SEC;
				sig->rlim[RLIMIT_RTTIME].rlim_cur = soft;
			}
			printk(KERN_INFO
				"RT Watchdog Timeout: %s[%d]\n",
				tsk->comm, task_pid_nr(tsk));
			__group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
		}
	}
}

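/*
 * Stop the per-group cputimer; called once no process-wide timer, itimer
 * or RLIMIT_CPU limit remains armed.
 */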
static void stop_process_timers(struct signal_struct *sig)
{
	struct thread_group_cputimer *cputimer = &sig->cputimer;
	unsigned long flags;

	spin_lock_irqsave(&cputimer->lock, flags);
	cputimer->running = 0;
	spin_unlock_irqrestore(&cputimer->lock, flags);
}

static u32 onecputick;

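/*
 * Check one of the ITIMER_PROF/ITIMER_VIRTUAL itimers: if it has expired,
 * send the signal and rearm it from its increment (carrying an error term
 * so the period stays accurate in ticks), then fold the new expiry time
 * into *expires if it is the earliest.
 */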
static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
			     cputime_t *expires, cputime_t cur_time, int signo)
{
	if (cputime_eq(it->expires, cputime_zero))
		return;

	if (cputime_ge(cur_time, it->expires)) {
		if (!cputime_eq(it->incr, cputime_zero)) {
			it->expires = cputime_add(it->expires, it->incr);
			it->error += it->incr_error;
			if (it->error >= onecputick) {
				it->expires = cputime_sub(it->expires,
							  cputime_one_jiffy);
				it->error -= onecputick;
			}
		} else {
			it->expires = cputime_zero;
		}

		trace_itimer_expire(signo == SIGPROF ?
				    ITIMER_PROF : ITIMER_VIRTUAL,
				    tsk->signal->leader_pid, cur_time);
		__group_send_sig_info(signo, SEND_SIG_PRIV, tsk);
	}

	if (!cputime_eq(it->expires, cputime_zero) &&
	    (cputime_eq(*expires, cputime_zero) ||
	     cputime_lt(it->expires, *expires))) {
		*expires = it->expires;
	}
}

/**
 * task_cputime_zero - Check a task_cputime struct for all zero fields.
 *
 * @cputime:	The struct to compare.
 *
 * Checks @cputime to see if all fields are zero.  Returns true if all fields
 * are zero, false if any field is nonzero.
 */
static inline int task_cputime_zero(const struct task_cputime *cputime)
{
	if (cputime_eq(cputime->utime, cputime_zero) &&
	    cputime_eq(cputime->stime, cputime_zero) &&
	    cputime->sum_exec_runtime == 0)
		return 1;
	return 0;
}

/*
 * Check for any process-wide CPU timers that have fired and move them
 * off the tsk->signal->cpu_timers list onto the firing list.  Per-thread
 * timers have already been taken off.
 */
static void check_process_timers(struct task_struct *tsk,
				 struct list_head *firing)
{
	int maxfire;
	struct signal_struct *const sig = tsk->signal;
	cputime_t utime, ptime, virt_expires, prof_expires;
	unsigned long long sum_sched_runtime, sched_expires;
	struct list_head *timers = sig->cpu_timers;
	struct task_cputime cputime;
	unsigned long soft;

	/*
	 * Collect the current process totals.
	 */
	thread_group_cputimer(tsk, &cputime);
	utime = cputime.utime;
	ptime = cputime_add(utime, cputime.stime);
	sum_sched_runtime = cputime.sum_exec_runtime;
	maxfire = 20;
	prof_expires = cputime_zero;
	while (!list_empty(timers)) {
		struct cpu_timer_list *tl = list_first_entry(timers,
						      struct cpu_timer_list,
						      entry);
		if (!--maxfire || cputime_lt(ptime, tl->expires.cpu)) {
			prof_expires = tl->expires.cpu;
			break;
		}
		tl->firing = 1;
		list_move_tail(&tl->entry, firing);
	}

	++timers;
	maxfire = 20;
	virt_expires = cputime_zero;
	while (!list_empty(timers)) {
		struct cpu_timer_list *tl = list_first_entry(timers,
						      struct cpu_timer_list,
						      entry);
		if (!--maxfire || cputime_lt(utime, tl->expires.cpu)) {
			virt_expires = tl->expires.cpu;
			break;
		}
		tl->firing = 1;
		list_move_tail(&tl->entry, firing);
	}

	++timers;
	maxfire = 20;
	sched_expires = 0;
	while (!list_empty(timers)) {
		struct cpu_timer_list *tl = list_first_entry(timers,
						      struct cpu_timer_list,
						      entry);
		if (!--maxfire || sum_sched_runtime < tl->expires.sched) {
			sched_expires = tl->expires.sched;
			break;
		}
		tl->firing = 1;
		list_move_tail(&tl->entry, firing);
	}

	/*
	 * Check for the special case process timers.
	 */
	check_cpu_itimer(tsk, &sig->it[CPUCLOCK_PROF], &prof_expires, ptime,
			 SIGPROF);
	check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime,
			 SIGVTALRM);
	soft = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
	if (soft != RLIM_INFINITY) {
		unsigned long psecs = cputime_to_secs(ptime);
		unsigned long hard =
			ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_max);
		cputime_t x;
		if (psecs >= hard) {
			/*
			 * At the hard limit, we just die.
			 * No need to calculate anything else now.
			 */
			__group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
			return;
		}
		if (psecs >= soft) {
			/*
			 * At the soft limit, send a SIGXCPU every second.
			 */
			__group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
			if (soft < hard) {
				soft++;
				sig->rlim[RLIMIT_CPU].rlim_cur = soft;
			}
		}
		x = secs_to_cputime(soft);
		if (cputime_eq(prof_expires, cputime_zero) ||
		    cputime_lt(x, prof_expires)) {
			prof_expires = x;
		}
	}

	sig->cputime_expires.prof_exp = prof_expires;
	sig->cputime_expires.virt_exp = virt_expires;
	sig->cputime_expires.sched_exp = sched_expires;
	if (task_cputime_zero(&sig->cputime_expires))
		stop_process_timers(sig);
}

/*
 * This is called from the signal code (via do_schedule_next_timer)
 * when the last timer signal was delivered and we have to reload the timer.
 */
void posix_cpu_timer_schedule(struct k_itimer *timer)
{
	struct task_struct *p = timer->it.cpu.task;
	union cpu_time_count now;

	if (unlikely(p == NULL))
		/*
		 * The task was cleaned up already, no future firings.
		 */
		goto out;

	/*
	 * Fetch the current sample and update the timer's expiry time.
	 */
	if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
		cpu_clock_sample(timer->it_clock, p, &now);
		bump_cpu_timer(timer, now);
		if (unlikely(p->exit_state)) {
			clear_dead_task(timer, now);
			goto out;
		}
		read_lock(&tasklist_lock); /* arm_timer needs it.  */
		spin_lock(&p->sighand->siglock);
	} else {
		read_lock(&tasklist_lock);
		if (unlikely(p->sighand == NULL)) {
			/*
			 * The process has been reaped.
			 * We can't even collect a sample any more.
			 */
			put_task_struct(p);
			timer->it.cpu.task = p = NULL;
			timer->it.cpu.expires.sched = 0;
			goto out_unlock;
		} else if (unlikely(p->exit_state) && thread_group_empty(p)) {
			/*
			 * We've noticed that the thread is dead, but
			 * not yet reaped.  Take this opportunity to
			 * drop our task ref.
			 */
			clear_dead_task(timer, now);
			goto out_unlock;
		}
		spin_lock(&p->sighand->siglock);
		cpu_timer_sample_group(timer->it_clock, p, &now);
		bump_cpu_timer(timer, now);
		/* Leave the tasklist_lock locked for the call below.  */
	}

	/*
	 * Now re-arm for the new expiry time.
	 */
	BUG_ON(!irqs_disabled());
	arm_timer(timer);
	spin_unlock(&p->sighand->siglock);

out_unlock:
	read_unlock(&tasklist_lock);

out:
	timer->it_overrun_last = timer->it_overrun;
	timer->it_overrun = -1;
	++timer->it_requeue_pending;
}

/**
 * task_cputime_expired - Compare two task_cputime entities.
 *
 * @sample:	The task_cputime structure to be checked for expiration.
 * @expires:	Expiration times, against which @sample will be checked.
 *
 * Checks @sample against @expires to see if any field of @sample has expired.
 * Returns true if any field of the former is greater than or equal to the
 * corresponding field of the latter, provided that latter field is set.
 * Otherwise returns false.
 */
static inline int task_cputime_expired(const struct task_cputime *sample,
					const struct task_cputime *expires)
{
	if (!cputime_eq(expires->utime, cputime_zero) &&
	    cputime_ge(sample->utime, expires->utime))
		return 1;
	if (!cputime_eq(expires->stime, cputime_zero) &&
	    cputime_ge(cputime_add(sample->utime, sample->stime),
		       expires->stime))
		return 1;
	if (expires->sum_exec_runtime != 0 &&
	    sample->sum_exec_runtime >= expires->sum_exec_runtime)
		return 1;
	return 0;
}

/**
 * fastpath_timer_check - POSIX CPU timers fast path.
 *
 * @tsk:	The task (thread) being checked.
 *
 * Check the task and thread group timers.  If both are zero (there are no
 * timers set) return false.  Otherwise snapshot the task and thread group
 * timers and compare them with the corresponding expiration times.  Return
 * true if a timer has expired, else return false.
 */
static inline int fastpath_timer_check(struct task_struct *tsk)
{
	struct signal_struct *sig;

	if (!task_cputime_zero(&tsk->cputime_expires)) {
		struct task_cputime task_sample = {
			.utime = tsk->utime,
			.stime = tsk->stime,
			.sum_exec_runtime = tsk->se.sum_exec_runtime
		};

		if (task_cputime_expired(&task_sample, &tsk->cputime_expires))
			return 1;
	}

	sig = tsk->signal;
	if (sig->cputimer.running) {
		struct task_cputime group_sample;

		spin_lock(&sig->cputimer.lock);
		group_sample = sig->cputimer.cputime;
		spin_unlock(&sig->cputimer.lock);

		if (task_cputime_expired(&group_sample, &sig->cputime_expires))
			return 1;
	}

	return 0;
}

/*
 * This is called from the timer interrupt handler.  The irq handler has
 * already updated our counts.  We need to check if any timers fire now.
 * Interrupts are disabled.
 */
void run_posix_cpu_timers(struct task_struct *tsk)
{
	LIST_HEAD(firing);
	struct k_itimer *timer, *next;
	unsigned long flags;

	BUG_ON(!irqs_disabled());

	/*
	 * The fast path checks that there are no expired thread or thread
	 * group timers.  If that's so, just return.
	 */
	if (!fastpath_timer_check(tsk))
		return;

	if (!lock_task_sighand(tsk, &flags))
		return;
	/*
	 * Here we take off tsk->signal->cpu_timers[N] and
	 * tsk->cpu_timers[N] all the timers that are firing, and
	 * put them on the firing list.
	 */
	check_thread_timers(tsk, &firing);
	/*
	 * If there are any active process wide timers (POSIX 1.b, itimers,
	 * RLIMIT_CPU) cputimer must be running.
	 */
	if (tsk->signal->cputimer.running)
		check_process_timers(tsk, &firing);

	/*
	 * We must release these locks before taking any timer's lock.
	 * There is a potential race with timer deletion here, as the
	 * siglock now protects our private firing list.  We have set
	 * the firing flag in each timer, so that a deletion attempt
	 * that gets the timer lock before we do will give it up and
	 * spin until we've taken care of that timer below.
	 */
	unlock_task_sighand(tsk, &flags);

	/*
	 * Now that all the timers on our list have the firing flag,
	 * no one will touch their list entries but us.  We'll take
	 * each timer's lock before clearing its firing flag, so no
	 * timer call will interfere.
	 */
	list_for_each_entry_safe(timer, next, &firing, it.cpu.entry) {
		int cpu_firing;

		spin_lock(&timer->it_lock);
		list_del_init(&timer->it.cpu.entry);
		cpu_firing = timer->it.cpu.firing;
		timer->it.cpu.firing = 0;
		/*
		 * The firing flag is -1 if we collided with a reset
		 * of the timer, which already reported this
		 * almost-firing as an overrun.  So don't generate an event.
		 */
		if (likely(cpu_firing >= 0))
			cpu_timer_fire(timer);
		spin_unlock(&timer->it_lock);
	}
}

/*
 * Set one of the process-wide special case CPU timers or RLIMIT_CPU.
 * The tsk->sighand->siglock must be held by the caller.
 */
void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
			   cputime_t *newval, cputime_t *oldval)
{
	union cpu_time_count now;

	BUG_ON(clock_idx == CPUCLOCK_SCHED);
	cpu_timer_sample_group(clock_idx, tsk, &now);

	if (oldval) {
		/*
		 * We are setting an itimer.  The *oldval argument is
		 * absolute and we update it to be relative; the *newval
		 * argument is relative and we update it to be absolute.
		 */
		if (!cputime_eq(*oldval, cputime_zero)) {
			if (cputime_le(*oldval, now.cpu)) {
				/* Just about to fire. */
				*oldval = cputime_one_jiffy;
			} else {
				*oldval = cputime_sub(*oldval, now.cpu);
			}
		}

		if (cputime_eq(*newval, cputime_zero))
			return;
		*newval = cputime_add(*newval, now.cpu);
	}

	/*
	 * Update the expiration cache if the new expiry is earlier than the
	 * one currently cached; this also covers the case where RLIMIT_CPU
	 * has just been lowered below the cached prof_exp expiry.
	 */
	switch (clock_idx) {
	case CPUCLOCK_PROF:
		if (expires_gt(tsk->signal->cputime_expires.prof_exp, *newval))
			tsk->signal->cputime_expires.prof_exp = *newval;
		break;
	case CPUCLOCK_VIRT:
		if (expires_gt(tsk->signal->cputime_expires.virt_exp, *newval))
			tsk->signal->cputime_expires.virt_exp = *newval;
		break;
	}
}

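/*
 * Guts of clock_nanosleep for CPU clocks: arm a temporary, on-stack CPU
 * timer for the requested time and sleep until it fires or a signal
 * arrives, in which case the remaining time is reported back.
 */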
static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
			    struct timespec *rqtp, struct itimerspec *it)
{
	struct k_itimer timer;
	int error;

	/*
	 * Set up a temporary timer and then wait for it to go off.
	 */
	memset(&timer, 0, sizeof timer);
	spin_lock_init(&timer.it_lock);
	timer.it_clock = which_clock;
	timer.it_overrun = -1;
	error = posix_cpu_timer_create(&timer);
	timer.it_process = current;
	if (!error) {
		static struct itimerspec zero_it;

		memset(it, 0, sizeof *it);
		it->it_value = *rqtp;

		spin_lock_irq(&timer.it_lock);
		error = posix_cpu_timer_set(&timer, flags, it, NULL);
		if (error) {
			spin_unlock_irq(&timer.it_lock);
			return error;
		}

		while (!signal_pending(current)) {
			if (timer.it.cpu.expires.sched == 0) {
				/*
				 * Our timer fired and was reset.
				 */
				spin_unlock_irq(&timer.it_lock);
				return 0;
			}

			/*
			 * Block until cpu_timer_fire (or a signal) wakes us.
			 */
			__set_current_state(TASK_INTERRUPTIBLE);
			spin_unlock_irq(&timer.it_lock);
			schedule();
			spin_lock_irq(&timer.it_lock);
		}

		/*
		 * We were interrupted by a signal.
		 */
		sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp);
		posix_cpu_timer_set(&timer, 0, &zero_it, it);
		spin_unlock_irq(&timer.it_lock);

		if ((it->it_value.tv_sec | it->it_value.tv_nsec) == 0) {
			/*
			 * It actually did fire already.
			 */
			return 0;
		}

		error = -ERESTART_RESTARTBLOCK;
	}

	return error;
}

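/*
 * Entry point for clock_nanosleep on a CPU clock.  Sleeping on our own
 * per-thread clock makes no sense (we would never accumulate CPU time
 * while sleeping), so that case is rejected up front.
 */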
int posix_cpu_nsleep(const clockid_t which_clock, int flags,
		     struct timespec *rqtp, struct timespec __user *rmtp)
{
	struct restart_block *restart_block =
	    &current_thread_info()->restart_block;
	struct itimerspec it;
	int error;

	/*
	 * Diagnose required errors first.
	 */
	if (CPUCLOCK_PERTHREAD(which_clock) &&
	    (CPUCLOCK_PID(which_clock) == 0 ||
	     CPUCLOCK_PID(which_clock) == current->pid))
		return -EINVAL;

	error = do_cpu_nanosleep(which_clock, flags, rqtp, &it);

	if (error == -ERESTART_RESTARTBLOCK) {

		if (flags & TIMER_ABSTIME)
			return -ERESTARTNOHAND;
		/*
		 * Report back to the user the time still remaining.
		 */
		if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
			return -EFAULT;

		restart_block->fn = posix_cpu_nsleep_restart;
		restart_block->arg0 = which_clock;
		restart_block->arg1 = (unsigned long) rmtp;
		restart_block->arg2 = rqtp->tv_sec;
		restart_block->arg3 = rqtp->tv_nsec;
	}
	return error;
}

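/*
 * Restart handler for an interrupted CPU-clock nanosleep: resume sleeping
 * until the absolute expiry time saved in the restart block.
 */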
long posix_cpu_nsleep_restart(struct restart_block *restart_block)
{
	clockid_t which_clock = restart_block->arg0;
	struct timespec __user *rmtp;
	struct timespec t;
	struct itimerspec it;
	int error;

	rmtp = (struct timespec __user *) restart_block->arg1;
	t.tv_sec = restart_block->arg2;
	t.tv_nsec = restart_block->arg3;

	restart_block->fn = do_no_restart_syscall;
	error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it);

	if (error == -ERESTART_RESTARTBLOCK) {
		/*
		 * Report back to the user the time still remaining.
		 */
		if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
			return -EFAULT;

		restart_block->fn = posix_cpu_nsleep_restart;
		restart_block->arg0 = which_clock;
		restart_block->arg1 = (unsigned long) rmtp;
		restart_block->arg2 = t.tv_sec;
		restart_block->arg3 = t.tv_nsec;
	}
	return error;

}


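/*
 * Clock handlers for CLOCK_PROCESS_CPUTIME_ID and CLOCK_THREAD_CPUTIME_ID:
 * thin wrappers that redirect to the corresponding CPUCLOCK_SCHED clock
 * for the caller's own process or thread.
 */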
#define PROCESS_CLOCK	MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED)
#define THREAD_CLOCK	MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED)

static int process_cpu_clock_getres(const clockid_t which_clock,
				    struct timespec *tp)
{
	return posix_cpu_clock_getres(PROCESS_CLOCK, tp);
}
static int process_cpu_clock_get(const clockid_t which_clock,
				 struct timespec *tp)
{
	return posix_cpu_clock_get(PROCESS_CLOCK, tp);
}
static int process_cpu_timer_create(struct k_itimer *timer)
{
	timer->it_clock = PROCESS_CLOCK;
	return posix_cpu_timer_create(timer);
}
static int process_cpu_nsleep(const clockid_t which_clock, int flags,
			      struct timespec *rqtp,
			      struct timespec __user *rmtp)
{
	return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp, rmtp);
}
static long process_cpu_nsleep_restart(struct restart_block *restart_block)
{
	return -EINVAL;
}
static int thread_cpu_clock_getres(const clockid_t which_clock,
				   struct timespec *tp)
{
	return posix_cpu_clock_getres(THREAD_CLOCK, tp);
}
static int thread_cpu_clock_get(const clockid_t which_clock,
				struct timespec *tp)
{
	return posix_cpu_clock_get(THREAD_CLOCK, tp);
}
static int thread_cpu_timer_create(struct k_itimer *timer)
{
	timer->it_clock = THREAD_CLOCK;
	return posix_cpu_timer_create(timer);
}
static int thread_cpu_nsleep(const clockid_t which_clock, int flags,
			     struct timespec *rqtp, struct timespec __user *rmtp)
{
	return -EINVAL;
}
static long thread_cpu_nsleep_restart(struct restart_block *restart_block)
{
	return -EINVAL;
}

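/*
 * Register the process and thread CPU clocks with the POSIX clock core,
 * and compute the length of one cputime tick in nanoseconds for the
 * itimer error accounting in check_cpu_itimer().
 */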
static __init int init_posix_cpu_timers(void)
{
	struct k_clock process = {
		.clock_getres = process_cpu_clock_getres,
		.clock_get = process_cpu_clock_get,
		.clock_set = do_posix_clock_nosettime,
		.timer_create = process_cpu_timer_create,
		.nsleep = process_cpu_nsleep,
		.nsleep_restart = process_cpu_nsleep_restart,
	};
	struct k_clock thread = {
		.clock_getres = thread_cpu_clock_getres,
		.clock_get = thread_cpu_clock_get,
		.clock_set = do_posix_clock_nosettime,
		.timer_create = thread_cpu_timer_create,
		.nsleep = thread_cpu_nsleep,
		.nsleep_restart = thread_cpu_nsleep_restart,
	};
	struct timespec ts;

	register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process);
	register_posix_clock(CLOCK_THREAD_CPUTIME_ID, &thread);

	cputime_to_timespec(cputime_one_jiffy, &ts);
	onecputick = ts.tv_nsec;
	WARN_ON(ts.tv_sec != 0);

	return 0;
}
__initcall(init_posix_cpu_timers);