#!/usr/bin/perl -w

# Zzztop.pl
#
# Copyright 2013, Phil Carmody
# Inspired by "PowerTOP", but renamed as it's more concerned with
# sleeping than it is with the actual power consumed. Detects tasks
# that prevent the system from sleeping, in particular those that
# poll files or incorrectly poll() them, and ones that play ping-pong
# with each other.


# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details. This can be found
# at <http://www.gnu.org/licenses/>.


use strict qw(refs vars subs);
use warnings;

my $verbose=1;
my $all=0; # show all frequencies (etc.?) even if data is 0
my $incompat=0; # more accuracy then maemo/meego's powertop means different results

# Need to select which timer to use. Time::HiRes may not be available
my ($xtime,$calibrate);
eval { require Time::HiRes; };
if(!$@) { $xtime = \&Time::HiRes::time; }
else {
    print STDERR "WARNING: Only low accuracy timing available.\n";
    sub xtime {
	open(S, "</proc/stat");
	<S>; $_=<S>;
	close(S);
	my @a=split(/\s+/);
	return ($a[1]+$a[2]+$a[3]+$a[4]+$a[5]+$a[6]+$a[7])/100;
    }
    $xtime=\&xtime;
    $calibrate=1; 
}

# Not every check is appropriate, depending on platform
my ($do_cpuidle, $do_cpufreq, $do_interrupts, $do_timerstats, $do_ctxtsw)=(1,1,1,1,1);

sub usage()
{
    print<<HELP
ZzzTop: show wakeup reasons, and other PM-related CPU info
Usage:
zzztop [-h] [-s=<nnn>] [-t=<nnn>] [-a]
  -h       this help text
  -s=<nnn> sleep for <nnn> seconds before collecting data
  -t=<nnn> collect data for <nnn> seconds
  -a       for some data gathered (cpufreq), show even 0 records
HELP
}

my ($sleep,$time)=(10,30);
while($_=shift(@ARGV)) {
    if(m/-s=(\d+)/) { $sleep=int($1); }
    elsif(m/-t=(\d+)/) { $time=int($1); }
    elsif(m/-a/) { $all=1; }
    elsif(m/-h/) { usage(); exit; }
    else { unshift(@ARGV, $_); last; }
}

my $mtime;
# gmtime(); # flush any one-time-only date prep now before the critical loop

my $cpudir='/sys/devices/system/cpu';

sub headstrip($)
{
    open(F, "<", $_[0]) or die("failed to read $_[0]");
    $_=<F>;
    close(F);
    chomp if($_);
    $_;
}

# BEGIN cpu
sub get_cpu_range()
{
    my $c=-1;
    my @cpus=glob("$cpudir/cpu[0-9]");
    foreach my $cpuname (@cpus) {
	my ($cpu)=($cpuname=~m/(\d+)$/);
	if($cpu>$c) { $c=$cpu; }
    }
    print("Detected ".($c+1)." cpus\n")
	if($verbose);
    $c;
}

my $maxcpu=0;
$maxcpu = get_cpu_range();
if($maxcpu<0) {
    $do_cpuidle=0;
    print STDERR "All cpu-specific data gathering disabled - no $cpudir/cpu*\n";
}
# END cpu

# BEGIN cpuidle
sub get_cpuidle_ranges($)
{
    my $ret=$_[0];
    my $s=-1;
    foreach my $cpu (0..$maxcpu) {
	my @states=glob("$cpudir/cpu$cpu/cpuidle/state[0-9]*");
	my $t=-1;
	foreach my $state (@states) {
	    my ($statenum)=($state =~ m/(\d+)$/);
	    if($statenum>$t) { $t=$statenum; }
	}
	if($t>=0) { $ret->[$cpu]=$t; }
	if($t>$s) { $s=$t; }
    }
    print("Detected ".($s+1)." cpuidle states (@$ret)\n")
	if($verbose);
    $s;
}

my @maxstate;
my $maxstate = get_cpuidle_ranges(\@maxstate);
if($#maxstate<0) {
    $do_cpuidle=0;
    print STDERR "cpuidle disabled - no $cpudir/cpu*/cpuidle trees\n";
    print STDERR "                   enable CPU_IDLE in kernel config\n"
	if($verbose);
}

sub get_cpuidle_data() {
    my @ret=();
    foreach my $cpu (0..$maxcpu) {
	$ret[$cpu]=[];
	foreach my $state (0..$maxstate[$cpu]) {
	    my $time=headstrip("$cpudir/cpu$cpu/cpuidle/state$state/time");
	    my $usage=headstrip("$cpudir/cpu$cpu/cpuidle/state$state/usage");
	    $ret[$cpu]->[$state]=[$time,$usage];
	}
    }
    @ret;
}

sub diff_cpuidle($$)
{
    my ($pre, $post)=@_;
    print("C-state Information\n");
    print("===================\n");
    
    my @totalidle=();
    my $usagemax=0;
    foreach my $cpu (0..$maxcpu) {
	foreach my $state (0..$maxstate[$cpu]) {
	    my $dt = $post->[$cpu][$state][0] - $pre->[$cpu][$state][0];
	    my $du = $post->[$cpu][$state][1] - $pre->[$cpu][$state][1];
	    $totalidle[$cpu] += $dt;
	    my $residence = $du ? int($dt/$du) : 0;
	    if($residence > $usagemax) { $usagemax=$residence; }
	}
    }
    my ($usagescale, $usageletter)=(1,'u');
    if($usagemax>99999) { $usagescale=1000; $usageletter='m'; }

    my ($h1,$h2,$h3);
    $h1="    |";
    $h2=" C# |";
    $h3="----+";
    foreach my $cpu (0..$maxcpu) {
	$h1.="      CPU#$cpu      |";
	$h2.="  time  | avg/${usageletter}s |";
	$h3.="--------+--------+";
    }
    print("$h1\n$h2\n$h3\n");

    printf(" C%u |", 0); # active is the fake state C0
    foreach my $cpu (0..$maxcpu) {
	printf(" %5.1f%% |        |", 100-$totalidle[$cpu]/($mtime*10000));
    }
    print("\n");
    foreach my $state (0..$maxstate) {
	printf(" C%u |", $state+1);
	foreach my $cpu (0..$maxcpu) {
	    if($state<=$maxstate[$cpu]) {
		my $dt = $post->[$cpu][$state][0] - $pre->[$cpu][$state][0];
		my $du = $post->[$cpu][$state][1] - $pre->[$cpu][$state][1];
		# times in us, so divide by 1000000, but want % so use 10000.
		# usage is number of transition
		printf(" %5.1f%% |  % 5s |", 
		       $dt/($mtime*10000),
		       $du ? int($dt/$du/$usagescale) : "");
	    } else {
		printf("        |        |");
	    }
	}
	print("\n");
    }
    #With the fake C0 record for active, there's no need for the idle sum too
    #print("Idle\t|");
    #foreach my $cpu (0..$maxcpu) {
    #	printf(" %4.1f%%\t|\t|", $totalidle[$cpu]/($mtime*10000));
    #}
    #print("\n");
    print("\n");
}
# END cpuidle

# BEGIN cpufreq
sub get_cpufreq_ranges()
{
    return -r "$cpudir/cpu0/cpufreq/stats/time_in_state" ? 0 : -1;
}

if(get_cpufreq_ranges()<0) {
    $do_cpufreq = 0;
    print STDERR "cpufreq disabled - no $cpudir/cpu*/cpufreq/stats/time_in_state\n";
    print STDERR "                   enable CPU_FREQ_STAT in kernel config\n"
	if($verbose);
}

sub get_cpufreq_data() {
    my @ret=();
    foreach my $cpu (0..$maxcpu) {
	$ret[$cpu]={};
	open(F, "<$cpudir/cpu$cpu/cpufreq/stats/time_in_state") or die("cpufreq: 'read $cpudir/cpu$cpu/cpufreq/stats/time_in_state' $@");
	while(<F>) {
	    if(m/(\d+)\s+(\d+)/) { $ret[$cpu]->{$1} = $2; }
	    else { die("WTF is this doing in cpufreq: $_"); }
	}
	close(F);
	# print("cpu$cpu -> ", join(" ",keys(%{$ret[$cpu]})), "\n");
    }
    @ret;
}

sub diff_cpufreq($$)
{
    my ($pre, $post)=(@_);
    my %okfreq=();
    print("CPUfreq statistics\n");
    print("==================\n");
    my @totalticks=();
    my @freqs = sort { $a<=>$b } keys(%{$pre->[0]});
    foreach my $cpu (0..$maxcpu) {
	$totalticks[$cpu]=0;
	foreach my $freq (@freqs) {
	    my $dt = $post->[$cpu]->{$freq} - $pre->[$cpu]->{$freq};
	    $totalticks[$cpu] += $dt;
	    if($dt) { $okfreq{$freq}++; }
	}
    }

    my $h1="Frequency |";
    my $h2="----------+";
    foreach my $cpu (0..$maxcpu) {
	$h1.="  CPU#$cpu |";
	$h2.="--------+";
    }
    print("$h1\n$h2\n");
    
    foreach my $freq (@freqs) {
	if(!$all and !$okfreq{$freq}) { next; }
	printf(" %4u MHz |", int($freq/1000));
	foreach my $cpu (0..$maxcpu) {
	    my $dt = $post->[$cpu]->{$freq} - $pre->[$cpu]->{$freq};
	    printf(" %5.1f%% |", $dt*100/$totalticks[$cpu]);
	}
	print("\n");
    }
    print("\n");
}
	    
# END cpufreq

# BEGIN interrupts
sub get_interrupt_names()
{
    open(I, "</proc/interrupts") or return undef;
    my @ret=();
    while(<I>) {
	if(m/^\s+CPU\d/) { next; }
	elsif(m/\s*(\w+):\s+\d/) { push(@ret, $1); }
	else { print("interrupts: what's: $_"); }
    }
    close(I);
    @ret;
}

my @interrupt_names=get_interrupt_names();
if(scalar(@interrupt_names)<=0) {
    $do_interrupts = 0;
    print STDERR "interrupts disabled - no /proc/interrupts\n";
}

sub get_interrupt_data() {
    my @ret=();
    open(I, "</proc/interrupts");
    while(<I>) {
	if(m/^\s+CPU\d/) {
	    foreach my $cpu (0..$maxcpu+1) { $ret[$cpu]={}; }
	    next;
	} elsif(s/\s*([\w]+):\s+//) {
	    my $int=$1;
	    foreach my $cpu (0..$maxcpu) {
		s/(\d+)\s+//;
		if($maxcpu>0 && !length($_)) { last; }
		$ret[$cpu]->{$int}=$1; 
	    }
	    chomp;
	    $ret[$maxcpu+1]->{$int} = $_;
	}
	else { print("interrupts: what's: $_"); }
    }
    close(I);
    @ret;
}

sub diff_interrupts($$)
{
    my ($pre, $post)=(@_);
    my %okirq=();
    my @output=();
    my @activity=();
    my $total=0;

    foreach my $int (@interrupt_names) {
	if($maxcpu>0 and !exists($pre->[1]->{$int})) { next; } # ERR or MIS, cpu-less
	my $line=sprintf("%4s |", $int);
	my $doit=0;
	foreach my $cpu (0..$maxcpu) {
	    my $dt = $post->[$cpu]->{$int} - $pre->[$cpu]->{$int};
	    $line.=sprintf(" % 5u |", $dt);
	    $doit+=$dt;
	}
	if($doit) { 
	    push(@output, "$line $pre->[$maxcpu+1]->{$int}\n"); 
	    push(@activity, $doit); 
	    $total+=$doit;
	}
    }

    print("Interrupt statistics\n");
    print("====================\n");

    my $h1=" INT |";
    my $h2="-----+";
    foreach my $cpu (0..$maxcpu) {
	$h1.=" CPU#$cpu |";
	$h2.="-------+";
    }
    print("$h1\n$h2\n");
    my @indices = sort { $activity[$b]<=>$activity[$a]; } (0..$#activity);
    foreach(@indices) { print($output[$_]); }
    printf("Summary: %3.1f interrupts/s total\n", $total/$mtime);
    print("\n");
}
# END interrupts

# BEGIN timerstats
sub get_timerstats()
{
    open(T, ">/proc/timer_stats") or return -1;
    print T "0\n";
    close(T);
    0;
}

if(get_timerstats()<0) {
    $do_timerstats = 0;
    print STDERR "timerstats disabled - no writeable /proc/timer_stats\n";
    if($> != 0) { print STDERR "                      you need to be root!\n"; }
    elsif($verbose) {
	print STDERR "                      enable TIMER_STATS in kernel config\n";
    }
}

sub get_timerstats_data($) # 1 then 0
{
    my $stop=$_[0];
    open(T, ">/proc/timer_stats") or die("timerstats: open() failed: $@");
    print T (($stop?1:0),"\n");
    close(T);
}

sub diff_timerstats()
{
    my $ok=1;
    my @output=();
    my @activity=();
    open(T, "</proc/timer_stats") or die("timerstats: open() failed: $@");
    while(<T>) {
	if(m/Timer Stats Version/) { next; }
	if(m/Sample period: ([\d.]+)\s*s/) { if($1 eq "0.000") { $ok=0; last; } }
	if(m/(\d+) total events/) { if($1 eq "0") { $ok=0; last; } }
	if(m/\s+(\d+)(D?),\s+(\d+)\s(\S+)\s+(.*)$/) {
	    push(@output, sprintf(" % 5s | % 8s%1s| % 15s | %s\n",
				  $3, $1, $2, $4, $5));
	    push(@activity, $1+($3*10e-9)); # decimals keep tasks together ;-)
	}
    }
    close(T);
    if(!$ok) {
	print("Timerstats was unable to gather any data from /proc/timer_stats\n\n");
	return;
    }
    my @indices = sort { $activity[$b]<=>$activity[$a]; } (0..$#activity);
    print("Timer statistics\n");
    print("================\n");
    print("   PID | Activity |     task's comm | function\n");
    print("-------+----------+-----------------+---------\n");
    foreach(@indices) { print($output[$_]); }
    print("\n");
}
# END timerstats

my %taskinfo=();
my $taskinfodone=0;
sub get_task_info($)
{
    my $cmd=headstrip("/proc/$_[0]/cmdline");
    if($cmd) {
	$cmd=~tr/\0/ /;
    }
    $cmd;
}
sub get_tasks_info($) {
    if(!$_[0] and $taskinfodone) { return; }
    for my $p (glob("/proc/[0-9]*")) {
	my ($pid)=($p=~m@/proc/(\d+)@);
	$taskinfo{$pid}=get_task_info($pid);
    }
}

# BEGIN context switch stats
sub get_ctxtsw_init()
{
    get_tasks_info(0);
    0;
}
if(get_ctxtsw_init() < 0) {
    $do_ctxtsw = 0;
    print STDERR "context_switches disabled - this is unexpected\n";
}

sub get_ctxtsw_data()
{
    my %ret=();
    for my $p (glob("/proc/[0-9]*")) {
	open(T, "$p/status") or next;
	my ($pid)=($p=~m@/proc/(\d+)@);
	my @v=();
	my $luserspace=0;
	my ($name)=(<T>=~/Name:\s+(\S+)/);
	while(<T>) {
	    if(m/^(Vm|State:\s+Z)/) { $luserspace=1; }
	    elsif(m/(\S*)voluntary_ctxt_switches:\s+(\d+)/) {
		$v[length($1)?1:0] = $2;
	    }
	}
	if(!defined($taskinfo{$pid})) {
	    $taskinfo{$pid} = $luserspace ? get_task_info($pid) : "<$name>";
	}
	close(T);
	$ret{$pid}=\@v;
    }
    %ret;
}
sub diff_ctxtsw($$)
{
    my ($pre, $post)=@_;
    print("Context switches per task\n",
	  "=========================\n");
    print("  PID  | vol'try | non-vol | Cmdline\n",
	  "-------+---------+---------+--------\n");
    my @postpids=sort { $b<=>$a; } (keys(%$post));
    foreach my $p (@postpids) {
	my $prevals = $pre->{$p} || [0,0];
	my $dv=$post->{$p}->[0] - $prevals->[0];
	my $dn=$post->{$p}->[1] - $prevals->[1];
	printf(" % 5u | % 7u | % 7u | %s\n", $p, $dv, $dn, $taskinfo{$p})
	    if($dv || $dn);
    }
    print("\n");
}
# END context switch stats

if(!$do_cpuidle && !$do_cpufreq && !$do_interrupts && !$do_timerstats && !$do_ctxtsw) {
    print "No statistics can be gathered, aborting\n";
    exit;
}

# Timing preparations
print("Sleeping for $sleep seconds before collecting data for $time seconds\n");
my ($timepre1, $timepre2, $timepost1, $timepost2);
my $timecali=&$xtime if($calibrate);
sleep($sleep);
$timepre1=&$xtime;
if($sleep and $calibrate) {
    my $measured=$timepre1-$timecali;
    my $ratio=$measured/$sleep;
    if($ratio>1.01 or $ratio<0.99) { print("WARNING: No accurate timer found, relying on sleep itself.\n"); }
    else { $calibrate=0; } # we trust this measure
}

# Now actually gather the data
my %ctxtsw_pre=get_ctxtsw_data() if($do_ctxtsw);
my @cpuidle_pre=get_cpuidle_data() if($do_cpuidle);
my @cpufreq_pre=get_cpufreq_data() if($do_cpufreq);
my @interrupts_pre=get_interrupt_data() if($do_interrupts);
get_timerstats_data(1) if($do_timerstats);
$timepre2=&$xtime;
sleep($time);
$timepost1=&$xtime;
my %ctxtsw_post=get_ctxtsw_data() if($do_ctxtsw);
my @cpuidle_post=get_cpuidle_data() if($do_cpuidle);
my @cpufreq_post=get_cpufreq_data() if($do_cpufreq);
my @interrupts_post=get_interrupt_data() if($do_interrupts);
get_timerstats_data(0) if($do_timerstats);
$timepost2=&$xtime;

# @$mtime is the measurement time, which will probably be
#         longer than the time we requested.
if($calibrate) {
    $mtime = $time;
    printf("Presumably slept for %1.3fs\n", $mtime);
} else {
    $mtime =($timepost1-$timepre2); # how long the sleep took
    $mtime += (($timepre2-$timepre1)+($timepost2-$timepost1))/2 
	if($incompat); # plus the overhead of reading the values
    printf("Actually slept for %1.3fs\n", $mtime);
}

diff_cpuidle(\@cpuidle_pre, \@cpuidle_post) if($do_cpuidle);
diff_cpufreq(\@cpufreq_pre, \@cpufreq_post) if($do_cpufreq);
diff_interrupts(\@interrupts_pre, \@interrupts_post) if($do_interrupts);
diff_timerstats() if($do_timerstats);
diff_ctxtsw(\%ctxtsw_pre, \%ctxtsw_post) if($do_ctxtsw);

