Edit File: ppm_cputime.c
// SPDX-License-Identifier: GPL-2.0-only OR MIT
/*
 * Copyright (C) 2023 The Falco Authors.
 *
 * This file is dual licensed under either the MIT or GPL 2. See MIT.txt or
 * GPL2.txt for full copies of the license.
 */

#include <linux/version.h>

// These functions are taken from the linux kernel and are used only
// on versions that don't export task_cputime_adjusted()
#if(LINUX_VERSION_CODE < KERNEL_VERSION(4, 4, 0))

#if(LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 37))
#include <asm/atomic.h>
#else
#include <linux/atomic.h>
#endif
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/kdev_t.h>
#include <linux/delay.h>
#include <linux/proc_fs.h>
#include <linux/sched.h>
#include <linux/vmalloc.h>
#include <linux/wait.h>
#include <linux/tracepoint.h>
#include <net/sock.h>
#include <asm/unistd.h>

#include "ppm_ringbuffer.h"
#include "ppm_events_public.h"
#include "ppm_events.h"
#include "ppm.h"
#include "ppm_version.h"

#if(defined CONFIG_VIRT_CPU_ACCOUNTING_NATIVE) || (LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 30))
void ppm_task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) {
	*ut = p->utime;
	*st = p->stime;
}
#else

#ifndef cmpxchg_cputime
#define cmpxchg_cputime(ptr, old, new) cmpxchg(ptr, old, new)
#endif

#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN

#if(LINUX_VERSION_CODE >= KERNEL_VERSION(4, 13, 0)) || \
        (PPM_RHEL_RELEASE_CODE > 0 && PPM_RHEL_RELEASE_CODE >= PPM_RHEL_RELEASE_VERSION(7, 7))
#define ppm_vtime_starttime(tsk) ((tsk)->vtime.starttime)
#define ppm_vtime_seqlock(tsk) (&(tsk)->vtime.seqlock)
#define ppm_vtime_state(tsk) ((tsk)->vtime.state)
#else
#define ppm_vtime_starttime(tsk) ((tsk)->vtime_snap)
#define ppm_vtime_seqlock(tsk) (&(tsk)->vtime_seqlock)
#define ppm_vtime_state(tsk) ((tsk)->vtime_snap_whence)
#endif

static unsigned long long vtime_delta(struct task_struct *tsk) {
	unsigned long long clock;

	clock = local_clock();
	if(clock < ppm_vtime_starttime(tsk))
		return 0;

	return clock - ppm_vtime_starttime(tsk);
}

static void fetch_task_cputime(struct task_struct *t,
                               cputime_t *u_dst, cputime_t *s_dst,
                               cputime_t *u_src, cputime_t *s_src,
                               cputime_t *udelta, cputime_t *sdelta) {
	unsigned int seq;
	unsigned long long delta;

	do {
		*udelta = 0;
		*sdelta = 0;

		seq = read_seqbegin(ppm_vtime_seqlock(t));

		if(u_dst)
			*u_dst = *u_src;
		if(s_dst)
			*s_dst = *s_src;

		/* Task is sleeping, nothing to add */
		if(ppm_vtime_state(t) == VTIME_SLEEPING || is_idle_task(t))
			continue;

		delta = vtime_delta(t);

		/*
		 * Task runs either in user or kernel space, add pending nohz time to
		 * the right place.
		 */
		if(ppm_vtime_state(t) == VTIME_USER || t->flags & PF_VCPU) {
			*udelta = delta;
		} else {
			if(ppm_vtime_state(t) == VTIME_SYS)
				*sdelta = delta;
		}
	} while(read_seqretry(ppm_vtime_seqlock(t), seq));
}

void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime) {
	cputime_t udelta, sdelta;

	fetch_task_cputime(t, utime, stime, &t->utime, &t->stime, &udelta, &sdelta);
	if(utime)
		*utime += udelta;
	if(stime)
		*stime += sdelta;
}

#elif LINUX_VERSION_CODE < KERNEL_VERSION(3, 9, 0)
static inline void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime) {
	if(utime)
		*utime = t->utime;
	if(stime)
		*stime = t->stime;
}
#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
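/*
 * Worked example (editor's illustration, not part of the original source),
 * assuming the common case below where NSEC_PER_SEC is an exact multiple of
 * HZ, e.g. HZ = 250: each jiffy is NSEC_PER_SEC / HZ = 4000000 ns, so
 * nsecs_to_jiffies64(10000000) = 10000000 / 4000000 = 2 (div_u64 truncates).
 */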
uint64_t nsecs_to_jiffies64(uint64_t n) {
#if(NSEC_PER_SEC % HZ) == 0
	/* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */
	return div_u64(n, NSEC_PER_SEC / HZ);
#elif(HZ % 512) == 0
	/* overflow after 292 years if HZ = 1024 */
	return div_u64(n * HZ / 512, NSEC_PER_SEC / 512);
#else
	/*
	 * Generic case - optimized for cases where HZ is a multiple of 3.
	 * overflow after 64.99 years, exact for HZ = 60, 72, 90, 120 etc.
	 */
	return div_u64(n * 9, (9ull * NSEC_PER_SEC + HZ / 2) / HZ);
#endif
}

unsigned long nsecs_to_jiffies(uint64_t n) {
	return (unsigned long)nsecs_to_jiffies64(n);
}

#ifndef nsecs_to_cputime
#ifdef msecs_to_cputime
#define nsecs_to_cputime(__nsecs) msecs_to_cputime(div_u64((__nsecs), NSEC_PER_MSEC))
#else
#define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
#endif
#endif

#if(LINUX_VERSION_CODE >= KERNEL_VERSION(3, 8, 0))
/*
 * Perform (stime * rtime) / total, but avoid multiplication overflow by
 * losing precision when the numbers are big.
 */
static cputime_t scale_stime(uint64_t stime, uint64_t rtime, uint64_t total) {
	uint64_t scaled;

	for(;;) {
		/* Make sure "rtime" is the bigger of stime/rtime */
		if(stime > rtime)
			swap(rtime, stime);

		/* Make sure 'total' fits in 32 bits */
		if(total >> 32)
			goto drop_precision;

		/* Does rtime (and thus stime) fit in 32 bits? */
		if(!(rtime >> 32))
			break;

		/* Can we just balance rtime/stime rather than dropping bits? */
		if(stime >> 31)
			goto drop_precision;

		/* We can grow stime and shrink rtime and try to make them both fit */
		stime <<= 1;
		rtime >>= 1;
		continue;

drop_precision:
		/* We drop from rtime, it has more bits than stime */
		rtime >>= 1;
		total >>= 1;
	}

	/*
	 * Make sure gcc understands that this is a 32x32->64 multiply,
	 * followed by a 64/32->64 divide.
	 */
	scaled = div_u64((uint64_t)(uint32_t)stime * (uint64_t)(uint32_t)rtime, (uint32_t)total);
	return (__force cputime_t)scaled;
}

/*
 * Atomically advance counter to the new value. Interrupts, vcpu
 * scheduling, and scaling inaccuracies can cause cputime_advance
 * to be occasionally called with a new value smaller than counter.
 * Let's enforce atomicity.
 *
 * Normally a caller will only go through this loop once, or not
 * at all in case a previous caller updated counter the same jiffy.
 */
static void cputime_advance(cputime_t *counter, cputime_t new) {
	cputime_t old;

	while(new > (old = ACCESS_ONCE(*counter)))
		cmpxchg_cputime(counter, old, new);
}
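/*
 * Worked example (editor's illustration, not part of the original source)
 * for the adjustment below: with tick-based samples stime = 30 and
 * utime = 70 (total = 100) and a scheduler-accounted runtime rtime = 120,
 * scale_stime() keeps the 30/100 system-time ratio, so stime becomes
 * 30 * 120 / 100 = 36 and utime becomes rtime - stime = 84; the adjusted
 * pair still sums to rtime.
 */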
/*
 * Adjust tick based cputime random precision against scheduler
 * runtime accounting.
 */
static void cputime_adjust(struct task_cputime *curr,
#if(LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0))
                           struct prev_cputime *prev,
#else
                           struct cputime *prev,
#endif
                           cputime_t *ut,
                           cputime_t *st) {
	cputime_t rtime, stime, utime;

	/*
	 * Tick based cputime accounting depends on random scheduling
	 * timeslices of a task to be interrupted or not by the timer.
	 * Depending on these circumstances, the number of these interrupts
	 * may be over or under-optimistic, matching the real user and system
	 * cputime with a variable precision.
	 *
	 * Fix this by scaling these tick based values against the total
	 * runtime accounted by the CFS scheduler.
	 */
	rtime = nsecs_to_cputime(curr->sum_exec_runtime);

	/*
	 * Update userspace visible utime/stime values only if actual execution
	 * time is bigger than already exported. Note that it can happen that we
	 * provided bigger values due to scaling inaccuracy on big numbers.
	 */
	if(prev->stime + prev->utime >= rtime)
		goto out;

	stime = curr->stime;
	utime = curr->utime;

	if(utime == 0) {
		stime = rtime;
	} else if(stime == 0) {
		utime = rtime;
	} else {
		cputime_t total = stime + utime;

		stime = scale_stime((__force uint64_t)stime,
		                    (__force uint64_t)rtime,
		                    (__force uint64_t)total);
		utime = rtime - stime;
	}

	cputime_advance(&prev->stime, stime);
	cputime_advance(&prev->utime, utime);

out:
	*ut = prev->utime;
	*st = prev->stime;
}

void ppm_task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) {
	struct task_cputime cputime = {
#ifdef CONFIG_SCHED_BFS
	        .sum_exec_runtime = tsk_seruntime(p),
#else
	        .sum_exec_runtime = p->se.sum_exec_runtime,
#endif
	};

	task_cputime(p, &cputime.utime, &cputime.stime);
	cputime_adjust(&cputime, &p->prev_cputime, ut, st);
}
#else /* LINUX_VERSION_CODE < KERNEL_VERSION(3, 8, 0) */
static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total) {
	uint64_t temp = (__force uint64_t)rtime;

	temp *= (__force uint64_t)utime;

	if(sizeof(cputime_t) == 4)
		temp = div_u64(temp, (__force uint32_t)total);
	else
		temp = div64_u64(temp, (__force uint64_t)total);

	return (__force cputime_t)temp;
}

// Taken from task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
void ppm_task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) {
	cputime_t rtime, utime = p->utime, total = utime + p->stime;

	/*
	 * Use CFS's precise accounting:
	 */
	rtime = nsecs_to_cputime(p->se.sum_exec_runtime);

	if(total)
		utime = scale_utime(utime, rtime, total);
	else
		utime = rtime;

	/*
	 * Compare with previous values, to keep monotonicity:
	 */
	p->prev_utime = max(p->prev_utime, utime);
	p->prev_stime = max(p->prev_stime, rtime - p->prev_utime);

	*ut = p->prev_utime;
	*st = p->prev_stime;
}
#endif /* (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 8, 0)) */

#endif /* (defined CONFIG_VIRT_CPU_ACCOUNTING_NATIVE) || (LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 30)) */

#endif /* (LINUX_VERSION_CODE < KERNEL_VERSION(4, 4, 0)) */

#if(LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0))
#include <linux/jiffies.h>
#include <linux/param.h>

/*
 * Implementation copied from kernel/time/time.c in 4.11.0
 */
uint64_t nsec_to_clock_t(uint64_t x) {
#if(NSEC_PER_SEC % USER_HZ) == 0
	return div_u64(x, NSEC_PER_SEC / USER_HZ);
#elif(USER_HZ % 512) == 0
	return div_u64(x * USER_HZ / 512, NSEC_PER_SEC / 512);
#else
	/*
	 * max relative error 5.7e-8 (1.8s per year) for USER_HZ <= 1024,
	 * overflow after 64.99 years
	 * exact for HZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ...
	 */
	return div_u64(x * 9, (9ull * NSEC_PER_SEC + (USER_HZ / 2)) / USER_HZ);
#endif
}
#endif /* (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)) */
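/*
 * Usage sketch (editor's illustration, not part of the original file): on
 * kernels older than 4.4, where task_cputime_adjusted() is not exported,
 * a caller in the driver would use the wrapper defined above, e.g.:
 *
 *     cputime_t ut, st;
 *     ppm_task_cputime_adjusted(current, &ut, &st);
 *     // ut and st now hold the adjusted user/system time of the task
 */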