#!/usr/bin/perl -w
#
# gewake - monitor grid engine and wake up hosts if needed
#
# Copyright (C) 2012 Orion Poplawski <orion@cora.nwra.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#

use Getopt::Std;
use Date::Manip;

# Convert units into raw sizes
sub memValue {
   my ($arg) = @_;

   $arg =~ s/([KMG])//;
   return $arg if !defined($1);
   if ($1 eq "K") {
      $arg *= 1024;
   } elsif ($1 eq "M") {
      $arg *= 1024*1024;
   } elsif ($1 eq "G") {
      $arg *= 1024*1024*1024;
   }
   return $arg;
}

sub isJobHeld {
   my ($jobid) = @_;
   my $held = 0;

   open(QSTATH,"qstat -j $jobid |");
   while (<QSTATH>) {
      $held = 1 if /Job is in hold state/;
   }
   close(QSTATH);
   return $held;
}

# Load a parallel environment
sub loadParallelEnv {
   my ($envname) = @_;
   my %envconf;
   my %hostqueues;

   open(ENV,"qconf -sp $envname |");
   while (<ENV>) {
      chomp;
      my ($key,$value) = split(/ +/, $_);
      $envconf{$key} = $value;
   }
   close(ENV);

   # Find the hostqueues in the parallel environment
   open(ENV,"qstat -f -pe $envname |");
   while (<ENV>) {
      chomp;
      my ($queue,$host,$slots) = ($_ =~ m,^(\S+)@(\S+)\s+\S+\s+\d+/\d+/(\d+),);
      if (defined($slots)) {
         # Strip domain
         $host =~ s/\.[^@]*$//;
         if (defined($hostqueues{$queue})) {
            $hostqueues{$queue} += $slots;
         } else {
            $hostqueues{$queue} = $slots;
         }
         $hostqueues{"$queue\@$host"} = $slots;
         print "parallel env $envname hostqueues $queue\@$host = $slots\n" if (defined($opts{'d'}));
         print "parallel env $envname hostqueues $queue = $hostqueues{$queue}\n" if (defined($opts{'d'}));
      }
   }
   close(ENV);
   $envconf{'hostqueues'} = \%hostqueues;

   return \%envconf;
}

sub loadParallelEnvs {
   my ($envname) = @_;
   my %envconf;

   if ($envname =~ /\*/) {
      $envregex = $envname;
      $envregex =~ s/\*/.*/;
      open(ENVS,"qconf -spl |");
      while (<ENVS>) {
         chomp;
         my $thisenv = $_;
         if (/$envregex/) {
            $envconf{$thisenv} = loadParallelEnv($thisenv);
         }
      }
      close(ENVS);
   } else {
      $envconf{$envname} = loadParallelEnv($envname);
   }

   return \%envconf;
}

my $version = "1.9";
my $wakehostsfile = "/usr/local/etc/gewake.hosts";
my $wakeupcmd = "/usr/local/bin/wakeup";
my %status = ();
my $usage = <<EOU;
Usage: gewake [-d]

 -d Turns on debugging output
EOU

getopts("d",\%opts) || die $usage;

#These are the hosts we wake up, put in order of preference
open(HOSTS,"< $wakehostsfile") or die "Cannot open $wakehostsfile";
while (<HOSTS>) {
   chomp;
   s/#.*//;
   my ($hostname,$wait) = split;
   push(@wakeupHosts,$hostname) if $hostname;
   $wakeupHostsWait{$hostname} = $wait if $wait;
}
close(HOSTS);

# See if there are any waiting jobs
#job-ID  prior   name       user         state submit/start at     queue                 slots ja-task-ID
my %jobinfo;
my $hold;
my $tries = 0;
do {
   $hold = 0;
   open(QSTAT,'qstat -u \* |');
   my $now = new Date::Manip::Date("now");
   while (<QSTAT>) {
      chomp;
      my ($jobid,$prior,$name,$user,$state,$submitdate,$submittime,$slots,$tasklist) = split;
      if (defined($state) && $state =~ /h/ && $state !~ /E/) {
         # Job is in hold state - see if it is due to us
         if (&isJobHeld($jobid)) {
            $hold = 1;
            # If it is, just release it for now
            print("Releasing hold on $jobid: ");
            system("qrls $jobid") unless defined($opts{'d'});
         }
      } elsif (defined($state) && $state =~ /q/ && $state !~ /E/) {
         # Determine how many seconds ago the job was submitted
         my $subdate = new Date::Manip::Date("$submitdate $submittime");
         my $seconds = $now->calc($subdate, 1)->printf('%sys');

         # Wait for 60 seconds for scheduler to process new jobs
         if ($seconds > 60 or defined($opts{'d'})) {
            push(@waiting,$jobid);

            #Track some other info
            my $numtasks = 0;
         print "$jobid slots=$slots, tasklist=$tasklist" if (defined($opts{'d'}));
            if (defined($tasklist) and $tasklist ne "") {
               $jobinfo{$jobid}->{"tasklist"} = $tasklist;
               foreach $taskentry (split(",",$tasklist)) {
                  my ($starttask,$endtask,$stride);
                  if (($starttask,$endtask,$stride) = ($taskentry =~ /(\d+)-(\d+):(\d+)/)) {
                     $numtasks += ($endtask-$starttask+1)/$stride;
                  } elsif ($taskentry =~ /^\d+$/) {
                     $numtasks++;
                  }
               }
            } else {
               $numtasks = 1;
            }
            print "($numtasks)\n" if (defined($opts{'d'}));
            $jobinfo{$jobid}->{"slots"} = $slots * $numtasks;
            $jobinfo{$jobid}->{"user"} = $user;
         }
      }
   }
   close(QSTAT);
   $tries++;
   # Let released jobs get scheduled if possible
   sleep(10) if $hold;
} while ($hold and $tries < 10);

# Open status files
# Last hosts we tried to wake up and wait timer - we want to clear this if there are no
# waiting jobs so run before the no waiting jobs check below
my %lastwake;
if (-f "/var/lib/gewake/last") {
   open(LAST,"< /var/lib/gewake/last") or die "Cannot open /var/lib/gewake/last";
   while (<LAST>) {
      chomp;
      my ($hostname,$wait) = split;
      $lastwake{$hostname} = $wait;
   }
   close(LAST);
   # clear the last woken up hosts status file
   unlink("/var/lib/gewake/last") unless defined($opts{'d'});
}

# Quit if no waiting jobs
if (not @waiting) {
   print("No waiting jobs\n") if (defined($opts{'d'}));
   exit;
}

# List of hosts to not wake up
@blacklist = ();
if (-f "/var/lib/gewake/blacklist") {
  open(BL,"< /var/lib/gewake/blacklist");
  while (<BL>) {
    chomp;
    push(@blacklist,$_);
  }
  close(BL);
}

# Debug
if (defined($opts{'d'}) and @waiting) {
  print "Waiting: " . join(",",@waiting) . "\n";
}

# Get information on complexes
# qconf -sc
#  #name               shortcut    type        relop requestable consumable default  urgency
#  #-----------------------------------------------------------------------------------------
#  arch                a           RESTRING    ==    YES         NO         NONE     0
my (%complex);
open(QCONF,'qconf -sc |') or die 'Cannot run qconf -sc';
while (<QCONF>) {
   next if /^#/;
   chomp;
   my ($name,$shortcut,$type,$relop,$req,$con,$default,$urgency) = split(" ");
   # Only need requestable ones
   if ($req eq "YES" or $req eq "FORCED") {
      $complex{$name}->{"type"} = $type;
      # Modify operator for string comparisons
      if ($type eq "RESTRING") { $relop = "eq" };
      $complex{$name}->{"relop"} = $relop;
      print "complex: $name $type $relop\n" if defined($opts{'d'});
   }
}

# Collect queue information
#queuename                      qtype used/tot. load_avg arch          states
open(QSTAT,'qstat -f -F|');
my ($queue,$host);
while (<QSTAT>) {
   chomp;
   if (/@/) {
      #if ($opts{'d'} and $queue) {
      #   print "$queue\@$host: ";
      #   foreach my $resource (keys(%{$qhresource{"$queue\@$host"}})) {
      #      print "$resource=" . $qhresource{"$queue\@$host"}->{$resource} . ", ";
      #   }
      #   print "\n";
      #}

      my ($queueent,$qtype,$slots,$loadave,$arch,$state) = split;
      # Strip domain
      $queueent =~ s/\.[^@]*$//;
      ($queue,$host) = split("@",$queueent);

      # Debug
      print "$_ -> ($queue,$host)\n" if (defined($opts{'d'}));

      # Add to list of queues unless it is disabled
      push(@{$queuesForHost{$host}},$queue) unless $state =~ /d/;

      # Record default status for queues that are up.  We don't set
      # the status for down ones because we may override them later
      if ($state !~ "u") {
         $status{$queueent} = "up";
      }
      if ($state =~ "d") {
         $status{$queueent} = "disabled";
      }
      if ($state =~ "S") {
         $status{$queueent} = "suspended";
      }

      # Record queue type
      $queuetype{$queue} = $qtype
   } elsif (my ($type,$resource,$value) = /(\w\w):(\S+)=(\S+)/) {
      # Parse the resources for the queue
      print STDERR "Cannot parse $_ for resources\n" unless defined($resource);
      if (defined($complex{$resource}) and $complex{$resource}->{"type"} eq "MEMORY") { $value = &memValue($value); }
      $value =~ s/infinity/Infinity/;
      $qhresource{"$queue\@$host"}->{$resource} = $value;
   }
}
close(QSTAT);

# See why the jobs are waiting
# qstat -j output:
#hard_queue_list:   compute.q
#scheduling info:   queue instance "queue@hostname"
#                   queue instance "queue@hostname" dropped because it is temporarily not available
#                   queue instance "queue@hostname" dropped because it is full
#                   queue instance "queue@hostname" dropped because it is disabled
#                   cannot run in queue "queue" because it is not contained in its hard queue list (-q)
my @wokenhosts = ();
my $slotsneeded = 0;
# Number of hosts woken for this job
my $woke = 0;
# Hosts who have had their wakeup counter decremented
my %lastwakedecremented;
my $holdneeded = 0;
JOB: foreach $jobid (@waiting) {
   print("Processing $jobid(slots=$jobinfo{$jobid}->{slots})\n") if (defined($opts{'d'}));
   my $schedinfo = 0;
   my @hardqueuelist = ();
   my $error = 0;
   my %hardresource = ();
   my $parallelenvconf = undef;
   my $parallelenv = "";
   my $parallelrange = "";
   my $parallelalloc = "";
   my ($perangemin,$perangemax);
   my $penvfound = "";
   $holdneeded = 0;
   $woke = 0;
   open(QSTAT,"qstat -j $jobid |");
   QSTATLINE: while (<QSTAT>) {
      chomp;
      # Job is in error state
      if (/^error reason/) {
         $error = 1;
         next QSTATLINE;
      }

      # Get the hard resources to see if we can avoid waking up machines that
      # don't satisfy them
      if (/^hard resource_list:\s+(\S+)/) {
         foreach $item (split(",",$1)) {
            my ($resource,$value) = split("=",$item);
            if ($resource =~ /^mem_/) { $value = &memValue($value); }
            $hardresource{$resource} = $value;
         }
         next QSTATLINE;
      }

      # Signal start of scheduling info parsing
      $schedinfo = 1 if s/^scheduling info://;

      if (!$schedinfo) {
         if (/^hard_queue_list:\s*(.*)/) {
            # Strip domain
            my $hqlist = $1;
            $hqlist =~ s/(@\w+)\.[\w.]+/$1/g;
            @hardqueuelist = split(",",$hqlist);
            next QSTATLINE;
         } elsif (/^parallel environment:\s*(.*) range:\s*(.*)/) {
            $parallelenv = $1;
            $parallelenvconf = loadParallelEnvs($parallelenv) if !defined($parallelenvconf->{$parallelenv});
            foreach my $penv (keys(%$parallelenvconf)) {
               # Only set parallelalloc to $pe_slots if ALL PEs are $pe_slots
               if ($parallelalloc eq "" or $parallelalloc eq '$pe_slots') {
                  $parallelalloc = $parallelenvconf->{$penv}->{"allocation_rule"};
                  print STDERR "Adding parallel env $penv alloc $parallelalloc to job $jobid\n" if (defined($opts{'d'}));
               }
            }
            ($perangemin,$perangemax) = ($2 =~ /(\d+)-?(\d+)?/);
            # Try to wake up as many as possible, unless this is an SMP PE request
            $jobinfo{$jobid}->{"slots"} = (defined($perangemax) and $parallelalloc ne "\$pe_slots") ? $perangemax : $perangemin;
            print STDERR "Setting slots to $jobinfo{$jobid}->{slots} for parallelenv $parallelenv alloc $parallelalloc\n" if (defined($opts{'d'}));
            next QSTATLINE;
         }
         next QSTATLINE;
      } else {
         my $junk;
         if (($queue,$status) = /queue instance "(.+)" (?:dropped |)because it is (.+)/) {
            # Strip domain
            $queue =~ s/\.[^@]*$//;

            # Set the status if it isn't already "up"
            $status{$queue} = $status unless defined($status{$queue});

            # Debug
            print "status{$queue} = $status\n" if (defined($opts{'d'}));
            next QSTATLINE;
         } elsif (($queue,$status) = /queue instance "(.+)" is in (.+)/) {
            # Strip domain
            $queue =~ s/\.[^@]*$//;

            # Set the status if it isn't already "up"
            $status{$queue} = $status unless defined($status{$queue});

            # Debug
            print "status{$queue} = $status\n" if (defined($opts{'d'}));
            next QSTATLINE;
         } elsif (($queue) = /has no permission for cluster queue "(.+)"/) {
            $status{$queue} = "noperm";

            # Debug
            print "status{$queue} = $status{$queue}\n" if (defined($opts{'d'}));
            next QSTATLINE;
         } else {
            # We don't think we care about anything else, but list it if debugging
            print "Unknown status line: $_\n" if (defined($opts{'d'}));
            next QSTATLINE;
         }
      }
   }
   close(QSTAT);

   # Don't process error state jobs
   next if $error;

   # TODO - keeps track of slots per host
   $slotsneeded += $jobinfo{$jobid}->{slots};
   next JOB if $slotsneeded < 0;

   # Go through the list of wakeupHosts in order and wake up if needed
   my @hostsToWake = ();
   $jobslotsneeded = $jobinfo{$jobid}->{slots};
   HOST: foreach $host (@wakeupHosts) {
      # We may not have any queues because they may be disabled
      next if !defined($queuesForHost{$host});

      # Skip if we've already tried to wake it up
      next if grep($_ eq $host,@wokenhosts);

      # Loop through the queues
      QUEUE: foreach my $queue (@{$queuesForHost{$host}}) {
      #print STDERR "No queue for host $host\n" if !defined($queue);
         my $queuehost = "$queue\@$host";

         # Debug
         print "Checking $queuehost status{$queuehost}=" . $status{$queuehost} . "\n" if (defined($opts{'d'}));

         # Skip queue if not in hardqueuelist
         if (@hardqueuelist) {
            my $hardqueuematched = 0;
            QUEUEENTRY: foreach my $hardqueueentry (@hardqueuelist) {
               $hardqueuematched = 1 if $hardqueueentry eq $queue or $hardqueueentry eq $queuehost;
            }
            if (!$hardqueuematched) {
               print "Skipping $queuehost not in hardqueuelist " . join(",",@hardqueuelist) . "\n" if (defined($opts{'d'}));
               next QUEUE;
            }
         }

         # Skip queue if no permission
         if (defined($status{$queue}) and $status{$queue} eq "noperm") {
            print "Skipping $queuehost no permission for $queue\n" if (defined($opts{'d'}));
            next QUEUE;
         }

         # Skip queue if it is the wrong type
         if ($parallelenv eq "" and $queuetype{$queue} !~ /B/) {
            print("$jobid Skipping $queue because it is not of type batch\n") if (defined($opts{'d'}));
            next QUEUE
         }
         if ($parallelenv ne "" and $queuetype{$queue} !~ /P/) {
            print("$jobid Skipping $queue because it is not of type parallel\n") if (defined($opts{'d'}));
            next QUEUE
         }

         # Wakeup if the host is down
         if (defined($status{$queuehost}) && $status{$queuehost} eq "temporarily not available") {
            # Check hard resources using the appropriate complex method
            foreach $resource (sort(keys(%hardresource))) {
               print STDERR "Checking resource requirement $resource: " if (defined($opts{'d'}));
               if (defined($qhresource{$queuehost}->{$resource})) {
                  my $thishardresource = $hardresource{$resource};
                  # Booleans are reported as numeric, but can be specified as true/false
                  $thishardresource =~ s/true/1.000000/;
                  my $slots = ($jobinfo{$jobid}->{slots} > $qhresource{$queuehost}->{'slots'}) ? $qhresource{$queuehost}->{'slots'} : $jobinfo{$jobid}->{slots};
                  # TODO - This isn't "MEMORY" per se but consumable "YES" vs "JOB", but possibly only for MEMORY types
                  #print STDERR "next HOST if !($thishardresource" . (($complex{$resource}->{'type'} eq "MEMORY") ? " * $slots" : "") . ") $complex{$resource}->{relop} " . $qhresource{$queuehost}->{$resource} . ")\n" if (defined($opts{'d'}));
                  print STDERR "next HOST if !($thishardresource) $complex{$resource}->{relop} " . $qhresource{$queuehost}->{$resource} . ")\n" if (defined($opts{'d'}));
                  if ($complex{$resource}->{'relop'} eq "eq") {
                     # Need to quote strings
                     next HOST if eval "!(\"$thishardresource\" $complex{$resource}->{'relop'} \"$qhresource{$queuehost}->{$resource}\") ";
                  } else {
                     #next HOST if eval "!(($thishardresource" . (($complex{$resource}->{'type'} eq "MEMORY") ? " * $slots" : "") . ") $complex{$resource}->{'relop'} $qhresource{$queuehost}->{$resource})";
                     next HOST if eval "!(($thishardresource) $complex{$resource}->{'relop'} $qhresource{$queuehost}->{$resource})";
                  }
               } else {
                  next HOST;
               }
            }

            # Handle parallel environment requests
            if ($parallelenv ne "") {
               if ($parallelalloc eq '$pe_slots') {
                  if ($perangemin > $qhresource{$queuehost}->{"slots"}) {
                     print STDERR "Skipping $host because parallel env $parallelenv needs $perangemin slots > " . $qhresource{$queuehost}->{"slots"} . "\n" if (defined($opts{'d'}));
                     next HOST;
                  }
               }
               my @parallelenvs = keys(%$parallelenvconf);
               @parallelenvs = ($penvfound) if $penvfound;
               my $thispenvfound = 0;
               PENV: foreach my $penv (@parallelenvs) {
                  print STDERR "Checking parallel env $penv for $queuehost\n" if (defined($opts{'d'}));
                  if (!defined($parallelenvconf->{$penv}->{'hostqueues'}->{$queuehost})) {
                        print STDERR "Skipping $queuehost because it does not contain parallel env $penv\n" if (defined($opts{'d'}));
                        next PENV;
                  } elsif ($perangemin > $parallelenvconf->{$penv}->{'hostqueues'}->{$queue}) {
                        print STDERR "Skipping $queue because parallel env $penv needs $perangemin slots > " . $parallelenvconf->{$penv}->{'hostqueues'}->{$queue} . "\n" if (defined($opts{'d'}));
                        next PENV;
                  }
                  $thispenvfound = $penvfound = $penv;
                  print STDERR "Selecting parallel env $penv for $queuehost\n" if (defined($opts{'d'}));
                  last PENV;
               }
               if (!$thispenvfound) {
                  print STDERR "Skipping host $host because no suitable parallel env found\n" if (defined($opts{'d'}));
                  next HOST;
               }
            }

            if (grep(/^$host$/,@blacklist)) {
               #Blacklisted, skip
               print STDERR "Skipping blacklisted host $host\n" if (defined($opts{'d'}));
               next HOST;
            } elsif (defined($lastwake{$host})) {
               if (!defined($lastwakedecremented{$host})) {
                  $lastwake{$host}--;
                  $lastwakedecremented{$host} = 1;
                  if ($lastwake{$host} <= 0) {
                     print("Giving up on $host, trying the next one.\n");
                     delete $lastwake{$host};
                     # This isn't working well right now, but the idea is to give up on failing hosts
                     if (!defined($opts{'d'})) {
                        open(BL,">> /var/lib/gewake/blacklist");
                        print BL "$host\n";
                        close(BL);
                     }
                  } else {
                     print STDERR "Still waiting on $host for job $jobid - $lastwake{$host} more times\n"; # if (defined($opts{'d'}));
                  }
               }
            } else {
               print("Queing wake up of $host for jobid $jobid on queue $queue,") if defined($opts{'d'});
               print(" pe = $penvfound,") if $penvfound and defined($opts{'d'});
               print(" slots = ". $qhresource{$queuehost}->{"slots"} . ", status = " . $status{$queuehost} . ", slots needed = $slotsneeded\n") if defined($opts{'d'});

               # Add to list of hosts to wake
               push(@hostsToWake,$host);

               # If this is a multi-host parallel job with a range of slots, try to wait until all hosts are awake
               if ($penvfound and $parallelalloc and $parallelalloc ne '$pe_slots' and defined($perangemax)) {
                  print(" setting holdneeded\n") if defined($opts{'d'});
                  $holdneeded = 1;
               }
            }

            # Reduce the slots needed by the slots provided by this host
            $slotsneeded -= $qhresource{$queuehost}->{"slots"};
            $jobslotsneeded -= $qhresource{$queuehost}->{"slots"};
            print("slots needed = $slotsneeded, jobslotsneeded = $jobslotsneeded\n") if (defined($opts{'d'}));

            # Stop looking for hosts if we have enough for this job
            last HOST if $jobslotsneeded <= 0;
         }
      }
   }

   # If we are not a parallel job, or if we can wake up enough hosts, or if we just want as many as possible...
   if ($parallelenv eq "" or $jobslotsneeded <= 0 or $jobslotsneeded > 9000000) {
      HOST: foreach $host (@hostsToWake) {
         print("Waking up $host for job $jobid\n") if ($jobinfo{$jobid}->{"user"} ne "root");
         system("$wakeupcmd $host") unless defined($opts{'d'});
         $woke++;
         push(@wokenhosts,$host);

         # Start the wake up host counter
         $lastwake{$host} = $wakeupHostsWait{$host};

         # Sleep a bit to stagger power ups
         sleep(5);
      }
   } else {
      print "Skipping jobid $jobid, still need $jobslotsneeded slots for parallel job\n" if defined($opts{'d'});
   }
} continue {
   # Place a job hold if waking up multiple hosts
   if ($holdneeded and $woke > 1) {
      print("Placing hold on $jobid: ");
      system("qhold -o u $jobid") unless defined($opts{'d'});
   }
}

# Record that last host(s) woken up
if (%lastwake and !defined($opts{'d'})) {
   open(LAST,"> /var/lib/gewake/last") or die;
   foreach $host (sort(keys(%lastwake))) {
      print LAST "$host $lastwake{$host}\n";
   }
   close(LAST);
}
