#!/usr/bin/ksh
 #
 # iotop - display top disk I/O events by process.
 #         Written using DTrace (Solaris 10 3/05).
 #
 # This is measuring disk events that have made it past system caches.
 #
 # $Id: iotop 8 2007-08-06 05:55:26Z brendan $
 #
 # USAGE:	iotop [-C] [-D|-o|-P] [-j|-Z] [-d device] [-f filename] 
 #		      [-m mount_point] [-t top] [interval [count]]
 #
 #		iotop   	# default output, 5 second intervals
 #
 #		-C		# don't clear the screen
 #		-D		# print delta times, elapsed, us
 #		-j		# print project ID
 #		-o		# print disk delta times, us
 #		-P		# print %I/O (disk delta times)
 #		-Z		# print zone ID
 #		-d device	# instance name to snoop (eg, dad0)
 #		-f filename	# full pathname of file to snoop
 #		-m mount_point	# this FS only (will skip raw events)
 #		-t top		# print top number only
 #	eg,
 #		iotop 1  	# 1 second samples
 #		iotop -C	# don't clear the screen
 #		iotop -P	# print %I/O (time based)
 #		iotop -j	# print project IDs
 #		iotop -Z 	# print zone IDs
 #		iotop -t 20 	# print top 20 lines only
 #		iotop -C 5 12	# print 12 x 5 second samples
 # 	
 # FIELDS:
 #		UID		user ID
 #		PID		process ID
 #		PPID		parent process ID
 #		PROJ		project ID
 #		ZONE		zone ID
 #		CMD		process command name
 #		DEVICE  	device name
 #		MAJ     	device major number
 #		MIN     	device minor number
 #		D		direction, Read or Write
 #		BYTES		total size of operations, bytes
 #		ELAPSED		total elapsed from request to completion, us
 #		DISKTIME	total time for disk to complete request, us
 #		%I/O		percent disk I/O, based on time (DISKTIME)
 #		load		1 min load average
 #		disk_r		total disk read Kbytes for sample
 #		disk_w		total disk write Kbytes for sample
 # 
 # NOTE:
 # * There are two different delta times reported. -D prints the
 #   elapsed time from the disk request (strategy) to the disk completion
 #   (iodone); -o prints the time the disk took to complete that event
 #   since its last event (the time between iodones), or the time since
 #   the strategy if the disk had been idle.
 # * The %I/O value can exceed 100%. It represents how busy a process is
 #   making the disks, in terms of a single disk. A value of 200% could 
 #   mean 2 disks are busy at 100%, or 4 disks at 50%...
 #
 # SEE ALSO: iosnoop
 #	    BigAdmin: DTrace, http://www.sun.com/bigadmin/content/dtrace
 #	    Solaris Dynamic Tracing Guide, http://docs.sun.com
 #	    DTrace Tools, http://www.brendangregg.com/dtrace.html
 #
 # INSPIRATION:  top(1) by William LeFebvre
 #
 # COPYRIGHT: Copyright (c) 2005, 2006 Brendan Gregg.
 #
 # CDDL HEADER START
 #
 #  The contents of this file are subject to the terms of the
 #  Common Development and Distribution License, Version 1.0 only
 #  (the "License").  You may not use this file except in compliance
 #  with the License.
 #
 #  You can obtain a copy of the license at Docs/cddl1.txt
 #  or http://www.opensolaris.org/os/licensing.
 #  See the License for the specific language governing permissions
 #  and limitations under the License.
 #
 # CDDL HEADER END
 #
 # KNOWN BUGS: 
 # - This can print errors while running on servers with Veritas volumes.
 #
 # Author: Brendan Gregg  [Sydney, Australia]
 #
 # 15-Jul-2005	Brendan Gregg	Created this.
 # 20-Apr-2006	   "      "	Last update.
 #
 
 
 ##############################
 # --- Process Arguments ---
 #
 
 ### default variables
 # What the report shows: which ID column, which statistic, how many rows.
 opt_def=1
 opt_proj=0
 opt_zone=0
 opt_bytes=1
 opt_elapsed=0
 opt_dtime=0
 opt_percent=0
 opt_clear=1
 opt_top=0
 top=0

 # Event filters; "." is the placeholder value while a filter is unset.
 filter=0
 opt_device=0
 device=.
 opt_file=0
 filename=.
 opt_mount=0
 mount=.

 # Sampling: seconds per report and number of reports.  The D program
 # stops when its countdown reaches 0, which never happens from -1.
 interval=5
 count=-1
 
 ### process options

 # usage - print the help text on stderr.
 #
 # Built with printf instead of a "<<-" here-document: "<<-" strips only
 # leading tabs, so a terminator line that starts with any other
 # whitespace is never recognised and the here-document swallows the
 # rest of the script.  printf has no such dependency on indentation.
 usage() {
 	{
 		printf '%s\n' \
 		    "USAGE: iotop [-C] [-D|-o|-P] [-j|-Z] [-d device] [-f filename]" \
 		    "             [-m mount_point] [-t top] [interval [count]]" \
 		    "" \
 		    "                -C      # don't clear the screen" \
 		    "                -D      # print delta times, elapsed, us" \
 		    "                -j      # print project ID" \
 		    "                -o      # print disk delta times, us" \
 		    "                -P      # print %I/O (disk delta times)" \
 		    "                -Z      # print zone ID" \
 		    "                -d device       # instance name to snoop " \
 		    "                -f filename     # snoop this file only" \
 		    "                -m mount_point  # this FS only "
 		# this line of the help text contains a literal tab
 		printf '%s\t%s\n' \
 		    "                -t top  " "# print top number only"
 		printf '%s\n' \
 		    "   eg," \
 		    "        iotop         # default output, 5 second samples" \
 		    "        iotop 1       # 1 second samples" \
 		    "        iotop -P      # print %I/O (time based)" \
 		    "        iotop -m /    # snoop events on filesystem / only" \
 		    "        iotop -t 20   # print top 20 lines only" \
 		    "        iotop -C 5 12 # print 12 x 5 second samples"
 	} >&2
 }

 # Each option only records a flag (and its argument, if any); conflicts
 # between flags are resolved after the loop.  -D, -o and -P replace the
 # default BYTES statistic; -j and -Z replace the default UID column.
 while getopts CDd:f:hjm:oPt:Z name
 do
 	case $name in
 	C)	opt_clear=0 ;;
 	D)	opt_elapsed=1; opt_bytes=0 ;;
 	d)	opt_device=1; device=$OPTARG ;;
 	f)	opt_file=1; filename=$OPTARG ;;
 	j)	opt_proj=1; opt_def=0 ;;
 	m)	opt_mount=1; mount=$OPTARG ;;
 	o)	opt_dtime=1; opt_bytes=0 ;;
 	P)	opt_percent=1; opt_dtime=1; opt_bytes=0 ;;
 	t)	opt_top=1; top=$OPTARG ;;
 	Z)	opt_zone=1; opt_def=0 ;;
 	# -h, or the "?" getopts reports for an invalid option
 	h|?)	usage
 		exit 1 ;;
 	esac
 done
 
 shift $(( OPTIND - 1 ))

 ### option logic

 # to_decimal VALUE
 #
 # Succeeds only when VALUE consists solely of decimal digits, and then
 # prints it with leading zeros removed (so an all-zero VALUE prints an
 # empty line).  Fails with status 1, printing nothing, for an empty
 # VALUE or one containing any other character.
 #
 # This replaces a [[ "$1" > 0 ]] test, which compares strings
 # lexicographically and therefore accepted text such as "abc" or "1;2"
 # as an interval.  Stripping zeros keeps a value like "08" from being
 # pasted into the D program as a malformed octal constant.
 to_decimal() {
 	case $1 in
 	''|*[!0-9]*)	return 1 ;;
 	esac
 	# drop one leading zero per pass; stops at the first other digit
 	# or when nothing is left
 	while [[ "$1" = 0* ]]; do
 		set -- "${1#0}"
 	done
 	echo "$1"
 }

 # First operand: interval in seconds.  A zero value is left in place and
 # ignored, as before; anything that is not a number is rejected here
 # instead of surfacing later as a D compilation error.
 if [[ -n "$1" ]]; then
 	value=$(to_decimal "$1") || {
 		echo "iotop: interval must be a positive integer: $1" >&2
 		exit 1
 	}
 	if [[ -n "$value" ]]; then
 		interval=$value; shift
 	fi
 fi

 # Next operand: number of reports to print, handled the same way.
 if [[ -n "$1" ]]; then
 	value=$(to_decimal "$1") || {
 		echo "iotop: count must be a positive integer: $1" >&2
 		exit 1
 	}
 	if [[ -n "$value" ]]; then
 		count=$value; shift
 	fi
 fi

 # -Z takes precedence over -j
 if (( opt_proj && opt_zone )); then
 	opt_proj=0
 fi

 # -o/-P take precedence over -D
 if (( opt_elapsed && opt_dtime )); then
 	opt_elapsed=0
 fi

 # any of -d, -m, -f switches tracing from "everything" to "matches only"
 if (( opt_device || opt_mount || opt_file )); then
 	filter=1
 fi

 # capture the terminal clear sequence once, for printing before each report
 if (( opt_clear )); then
 	clearstr=$(clear)
 else
 	clearstr=.
 fi
 
 
 
 #################################
 # --- Main Program, DTrace ---
 #
 # Run the D program below.  The program text is one single-quoted shell
 # word; each shell variable is spliced in by closing the single quotes,
 # expanding the variable inside double quotes, and reopening the single
 # quotes.  The double quotes matter: an unquoted expansion is subject to
 # field splitting and pathname expansion, so a filename or mount point
 # containing whitespace would break the program into several arguments.
 # Comments inside the program must not contain a single quote.
 /usr/sbin/dtrace -n '
  /*
   * Command line arguments
   *
   * NOTE: the three filter strings are placed between D double quotes
   * as they are, so a double quote or backslash in a value is still
   * interpreted by the D string literal syntax.
   */
  inline int OPT_def 	= '"$opt_def"';
  inline int OPT_proj 	= '"$opt_proj"';
  inline int OPT_zone 	= '"$opt_zone"';
  inline int OPT_clear 	= '"$opt_clear"';
  inline int OPT_bytes 	= '"$opt_bytes"';
  inline int OPT_elapsed = '"$opt_elapsed"';
  inline int OPT_dtime 	= '"$opt_dtime"';
  inline int OPT_percent	= '"$opt_percent"';
  inline int OPT_device 	= '"$opt_device"';
  inline int OPT_mount 	= '"$opt_mount"';
  inline int OPT_file 	= '"$opt_file"';
  inline int OPT_top 	= '"$opt_top"';
  inline int INTERVAL 	= '"$interval"';
  inline int COUNTER 	= '"$count"';
  inline int FILTER 	= '"$filter"';
  inline int TOP 	= '"$top"';
  inline string DEVICE 	= "'"$device"'";
  inline string FILENAME = "'"$filename"'";
  inline string MOUNT 	= "'"$mount"'";
  inline string CLEAR 	= "'"$clearstr"'";

  #pragma D option quiet

  /* boost the following if you get "dynamic variable drops" */
  #pragma D option dynvarsize=8m

  /*
   * Initialise state and print the startup message
   */
  dtrace:::BEGIN
  {
 	/* presumably here to fix the type of last_event before first use */
 	last_event[""] = 0;

 	/* starting values */
 	counts = COUNTER;	/* reports left to print */
 	secs = INTERVAL;	/* seconds left in the current sample */
 	disk_r = 0;		/* bytes read in the current sample */
 	disk_w = 0;		/* bytes written in the current sample */

 	printf("Tracing... Please wait.\n");
  }

  /*
   * Check event is being traced
   *
   * Sets this->ok, which the predicates of the later clauses on the
   * same probes test.
   */
  io:genunix::start,
  io:genunix::done
  {
 	/* default is to trace unless filtering, */
 	this->ok = FILTER ? 0 : 1;

 	/*
 	 * check each filter; any one match enables the event. The
 	 * conditional expression is used only for its assignment side
 	 * effect, the ": 1" arm does nothing.
 	 */
 	(OPT_device == 1 && DEVICE == args[1]->dev_statname)? this->ok = 1 : 1;
 	(OPT_file == 1 && FILENAME == args[2]->fi_pathname) ? this->ok = 1 : 1;
 	(OPT_mount == 1 && MOUNT == args[2]->fi_mount)  ? this->ok = 1 : 1;
  }

  /*
   * Reset last_event for disk idle -> start
   * this prevents idle time being counted as disk time.
   */
  io:genunix::start
  /! pending[args[1]->dev_statname]/
  {
 	/* nothing outstanding on this device: disk time starts now */
 	last_event[args[1]->dev_statname] = timestamp;
  }

  /*
   * Store entry details
   */
  io:genunix::start
  /this->ok/
  {
 	/* these are used as a unique disk event key, */
 	this->dev = args[0]->b_edev;
 	this->blk = args[0]->b_blkno;

 	/* save disk event details, for lookup by the done probe */
 	start_uid[this->dev, this->blk] = uid;
 	start_pid[this->dev, this->blk] = pid;
 	start_ppid[this->dev, this->blk] = ppid;
 	start_comm[this->dev, this->blk] = execname;
 	start_time[this->dev, this->blk] = timestamp;
 	start_proj[this->dev, this->blk] = curpsinfo->pr_projid;
 	start_zone[this->dev, this->blk] = curpsinfo->pr_zoneid;
 	start_rw[this->dev, this->blk] = args[0]->b_flags & B_READ ? "R" : "W";

 	/* add the request size to the read or write total for the sample */
 	disk_r += args[0]->b_flags & B_READ ? args[0]->b_bcount : 0;
 	disk_w += args[0]->b_flags & B_READ ? 0 : args[0]->b_bcount;

 	/* increase disk event pending count */
 	pending[args[1]->dev_statname]++;
  }

  /*
   * Process and Print completion
   */
  io:genunix::done
  /this->ok/
  {
 	/* decrease disk event pending count */
 	pending[args[1]->dev_statname]--;

 	/*
 	 * Process details
 	 */

 	/* fetch entry values saved by the start probe */
 	this->dev = args[0]->b_edev;
 	this->blk = args[0]->b_blkno;
 	this->suid = start_uid[this->dev, this->blk];
 	this->spid = start_pid[this->dev, this->blk];
 	this->sppid = start_ppid[this->dev, this->blk];
 	this->sproj = start_proj[this->dev, this->blk];
 	this->szone = start_zone[this->dev, this->blk];
 	self->scomm = start_comm[this->dev, this->blk];
 	this->stime = start_time[this->dev, this->blk];
 	this->etime = timestamp; /* endtime */
 	this->elapsed = this->etime - this->stime;
 	self->rw = start_rw[this->dev, this->blk];

 	/* time since the previous event on this device, 0 if none seen */
 	this->dtime = last_event[args[1]->dev_statname] == 0 ? 0 :
 	    timestamp - last_event[args[1]->dev_statname];

 	/* memory cleanup: zero the saved entries for this event key */
 	start_uid[this->dev, this->blk]  = 0;
 	start_pid[this->dev, this->blk]  = 0;
 	start_ppid[this->dev, this->blk] = 0;
 	start_time[this->dev, this->blk] = 0;
 	start_comm[this->dev, this->blk] = 0;
 	start_zone[this->dev, this->blk] = 0;
 	start_proj[this->dev, this->blk] = 0;
 	start_rw[this->dev, this->blk]   = 0;

 	/*
 	 * Choose statistic to track
 	 * (bytes, or a time divided by 1000 and reported as us)
 	 */
 	OPT_bytes   ? this->value = args[0]->b_bcount    : 1;
 	OPT_elapsed ? this->value = this->elapsed / 1000 : 1;
 	OPT_dtime   ? this->value = this->dtime / 1000   : 1;

 	/*
 	 * Save details
 	 * The three forms differ only in the first key: UID, project ID
 	 * or zone ID.
 	 */
 	OPT_def ? @out[this->suid, this->spid, this->sppid, self->scomm,
 	    args[1]->dev_statname, args[1]->dev_major, args[1]->dev_minor,
 	    self->rw] = sum(this->value) : 1;
 	OPT_proj ? @out[this->sproj, this->spid, this->sppid, self->scomm,
 	    args[1]->dev_statname, args[1]->dev_major, args[1]->dev_minor,
 	    self->rw] = sum(this->value) : 1;
 	OPT_zone ? @out[this->szone, this->spid, this->sppid, self->scomm,
 	    args[1]->dev_statname, args[1]->dev_major, args[1]->dev_minor,
 	    self->rw] = sum(this->value) : 1;

 	/* save last disk event */
 	last_event[args[1]->dev_statname] = timestamp;

 	/* release the thread-local temporaries */
 	self->scomm = 0;
 	self->rw = 0;
  }

  /*
   * Prevent pending from underflowing
   * this can happen if this program is started during disk events.
   */
  io:genunix::done
  /pending[args[1]->dev_statname] < 0/
  {
 	pending[args[1]->dev_statname] = 0;
  }

  /*
   * Timer
   */
  profile:::tick-1sec
  {
 	secs--;
  }

  /*
   * Print Report
   */
  profile:::tick-1sec
  /secs == 0/
  {
 	/*
 	 * fetch 1 min load average: whole part, then two decimal
 	 * places taken from the remainder of the division by 65536
 	 */
 	this->load1a  = `hp_avenrun[0] / 65536;
 	this->load1b  = ((`hp_avenrun[0] % 65536) * 100) / 65536;

 	/* convert counters to Kbytes */
 	disk_r /= 1024;
 	disk_w /= 1024;

 	/* print status */
 	OPT_clear ? printf("%s", CLEAR) : 1;
 	printf("%Y,  load: %d.%02d,  disk_r: %6d KB,  disk_w: %6d KB\n\n",
 	    walltimestamp, this->load1a, this->load1b, disk_r, disk_w);

 	/* print headers */
 	OPT_def  ? printf("  UID ") : 1;
 	OPT_proj ? printf(" PROJ ") : 1;
 	OPT_zone ? printf(" ZONE ") : 1;
 	printf("%6s %6s %-16s %-7s %3s %3s %1s",
 	    "PID", "PPID", "CMD", "DEVICE", "MAJ", "MIN", "D");
 	OPT_bytes   ? printf(" %16s\n", "BYTES") : 1;
 	OPT_elapsed ? printf(" %16s\n", "ELAPSED") : 1;
 	OPT_dtime && ! OPT_percent  ? printf(" %16s\n", "DISKTIME") : 1;
 	OPT_dtime && OPT_percent    ? printf(" %6s\n", "%I/O") : 1;

 	/* truncate to top lines if needed */
 	OPT_top ? trunc(@out, TOP) : 1;

 	/*
 	 * normalise to percentage if needed: the sums are in us, so
 	 * dividing by INTERVAL * 10000 gives percent of the sample
 	 */
 	OPT_percent ? normalize(@out, INTERVAL * 10000) : 1;

 	/* print data */
 	! OPT_percent ?
 	    printa("%5d %6d %6d %-16s %-7s %3d %3d %1s %16@d\n", @out) :
 	    printa("%5d %6d %6d %-16s %-7s %3d %3d %1s %6@d\n", @out);
 	printf("\n");

 	/* clear data and start the next sample */
 	trunc(@out);
 	disk_r = 0;
 	disk_w = 0;
 	secs = INTERVAL;
 	counts--;
  }

  /*
   * End of program
   */
  profile:::tick-1sec
  /counts == 0/
  {
 	exit(0);
  }

  /*
   * Cleanup for Ctrl-C
   * (empties the aggregation before the program ends)
   */
  dtrace:::END
  {
 	trunc(@out);
  }
 '