[gs-cvs] rev 8311 - trunk/cluster/tti

giles at ghostscript.com giles at ghostscript.com
Mon Oct 22 16:43:46 PDT 2007


Author: giles
Date: 2007-10-22 16:43:45 -0700 (Mon, 22 Oct 2007)
New Revision: 8311

Modified:
   trunk/cluster/tti/reg_dispatch-pcl.py
   trunk/cluster/tti/reg_dispatch.py
   trunk/cluster/tti/run_regression
Log:
Various updates.

* Use the new 'build32' machine for compilation.
* Avoid the new 'orange' cluster as it's 64 bits.
* Make the pcl regression run revisions in order
  besides accidentally.
* Hack around split cluster applications with ppn=2.
* Consolidate pcl job dispatch code.


Modified: trunk/cluster/tti/reg_dispatch-pcl.py
===================================================================
--- trunk/cluster/tti/reg_dispatch-pcl.py	2007-10-21 20:49:29 UTC (rev 8310)
+++ trunk/cluster/tti/reg_dispatch-pcl.py	2007-10-22 23:43:45 UTC (rev 8311)
@@ -93,6 +93,7 @@
   'Read the queue directory and select a revision to test'
   if not os.path.exists(cachedir): os.mkdir(cachedir)
   revs = os.listdir(cachedir)
+  revs.sort()
   try:
     rev = revs[0]
     os.unlink(os.path.join(cachedir, rev))
@@ -136,7 +137,7 @@
       procs = int(m.group("procs"))
       free = int(m.group("free"))
       # remember the cluster with the most free nodes
-      if free > nodes and name != 'total': 
+      if free > nodes and name != 'orange' and name != 'total': 
         nodes = free
         cluster = name
       clusters.append((name,procs,free))
@@ -152,18 +153,31 @@
   and a normal print command for progress and error messages.'''
   print '[' + time.ctime() + '] ' + msg
 
-def pbsjob(cmd, resources=None, stdout=None, stderr=None, mpi=True):
+def pbsjob(cmd, resources=None, workdir=None, 
+	stdout=None, stderr=None, mpi=True):
   if not resources:
     cluster, nodes = choosecluster()
+    while nodes < 1 or cluster == None:
+	log('clusters busy, waiting for an opening')
+	time.sleep(300)
+	cluster, nodes = choosecluster()
     if nodes > 1 and cluster == 'red' or cluster == 'green':
-      # red reports two cpus per node
+      # request two cpus per node
       nodes /= 2
       ppn = ':ppn=2'
+      # hack around an edge case
+      nodes -= 1
     else:
       ppn = ''
     resources = 'nodes=%d:%s:run%s,walltime=20:00' % (nodes, cluster, ppn)
-    print 'requesting', nodes, 'nodes on', cluster
-  f = open('regress.pbs', 'w')
+    if ppn:
+      ppnhelp = '(' + ppn[-1] + ' cpus per node)'
+    else:
+      ppnhelp = ''
+    print 'requesting', nodes, 'nodes on', cluster, ppnhelp
+  if stdout: filename = stdout + '.pbs'
+  else: filename = 'regress.pbs'
+  f = open(filename, 'w')
   f.write('#PBS -l ' + resources)
   if stdout:
     f.write(' -o ' + stdout)
@@ -171,7 +185,10 @@
       f.write(' -j oe')
     elif stderr:
       f.write(' -e ' + stderr)
-  f.write(' -d ' + os.getcwd())
+  if workdir:
+    f.write(' -d ' + workdir)
+  else:
+    f.write(' -d ' + os.getcwd())
   f.write('\n\n')
   if mpi:
     f.write('mpiexec -comm mpich2-pmi ')
@@ -180,7 +197,7 @@
   f.write(cmd)
   f.write('\n')
   f.close()
-  os.system('qsub regress.pbs')
+  os.system('qsub ' + filename)
 
 def build(workdir=None, clean=False):
   'compile an executable from the current source'
@@ -188,26 +205,36 @@
     cmd = "make clean && nice ./autogen.sh && nice make"
   else:
     cmd = "nice make"
-  if workdir:
-    cmd = "cd " + workdir + " && " + cmd
-  if False:
-    # build on the dispatch host
-    make = os.system(cmd)
-    make = make >> 8
-  else:
-    # FIXME: alternate build on a compile node
-    report = 'update.log'
-    resources = 'nodes=1:compile'
-    cmd += "\nexit"
-    if os.path.exists(report): os.unlink(report)
-    make = pbsjob(cmd, resources, stdout=report, mpi=False)
-    while not os.path.exists(report): time.sleep(5)
+  #if workdir:
+  #  cmd = "cd " + workdir + " && " + cmd
+  report = 'update.log'
+  resources = 'nodes=1:build32'
+  cmd += "\nexit"
+  if os.path.exists(report): os.unlink(report)
+  make = pbsjob(cmd, resources, workdir, stdout=report, mpi=False)
+  while not os.path.exists(report): time.sleep(5)
   if make:
     log("build failed! exit code " + str(make))
     return False
   # update successful
   return True
 
+def runrev(workdir=None, rev=None, report=None):
+  if not rev: rev = getrev()
+  if not report: report = "regression-r" + rev + ".log"
+  # remove the report if it exists since we use this to check completion
+  if os.path.exists(report): os.unlink(report)
+  start = time.time()
+  cmd = 'bwpython ../regress.py'
+  cmd += ' --batch --update'
+  cmd += ' --exe main/obj/pcl6'
+  pbsjob(cmd, resources=None, workdir=workdir, stdout=report)
+  # wait for the run to finish
+  while not os.path.exists(report):
+    time.sleep(20)
+  print "report is ready as '" + report + "'. total time %d seconds" % int(time.time() - start)
+  ircfile(report, rev)
+  mailfile(report, rev)
 
 def mainloop():
   log("starting up")
@@ -227,41 +254,8 @@
       if not os.path.exists(os.path.join(workdir, "reg_baseline.txt")):
         os.system("cp reg_baseline.txt " + workdir)
       log("running regression on ghostpcl-r" + rev)
-      start = time.time()
       report = "regression-r" + rev + ".log"
-      # remove the report if it exists since we use this to check completion
-      if os.path.exists(report): os.unlink(report)
-      f = open('regress.pbs', 'w')
-      cluster, nodes = choosecluster()
-      if nodes > 1 and (cluster == 'red' or cluster == 'green'):
-        # red reports two cpus per node
-        nodes /= 2
-        ppn = ':ppn=2'
-      else:
-        ppn = ''
-      f.write('#PBS -l nodes=%s:%s:run%s,walltime=20:00' % 
-        (nodes, cluster, ppn))
-      f.write(' -o ' + report)
-      #f.write(' -j oe')
-      #f.write(' -e /dev/null')
-      f.write(' -e ' + report + '.err')
-      f.write(' -d ' + os.path.join(os.getcwd(), workdir))
-      f.write('\n\n')
-      f.write('mpiexec -comm mpich2-pmi ')
-      f.write(' -nostdin -kill -nostdout')
-      f.write(' bwpython ../regress.py')
-      f.write(' --batch --update')
-      f.write(' --exe main/obj/pcl6')
-      f.write('\n')
-      f.close()
-      print 'requesting', nodes, 'nodes on', cluster
-      os.system('qsub regress.pbs')
-      # wait for the run to finish
-      while not os.path.exists(report):
-        time.sleep(20)
-      print "report is ready as '" + report + "'. total time %d seconds" % int(time.time() - start)
-      ircfile(report, rev)
-      #mailfile(report, rev)
+      runrev(workdir, rev, report)
       os.system("cp " + os.path.join(workdir, "reg_baseline.txt ") + " .")
     else:
       if doing:

Modified: trunk/cluster/tti/reg_dispatch.py
===================================================================
--- trunk/cluster/tti/reg_dispatch.py	2007-10-21 20:49:29 UTC (rev 8310)
+++ trunk/cluster/tti/reg_dispatch.py	2007-10-22 23:43:45 UTC (rev 8311)
@@ -90,7 +90,8 @@
      returns a (cluster_name, node_count) tuple.'''
   # figure out how many nodes are free
   r = re.compile('^\s+(?P<cluster>\w+).*\s+(?P<procs>\d+)\s+(?P<free>\d+)\s*$')
-  clusters=[]
+  clusters = []
+  cluster = None
   nodes = 0
   upnodes = os.popen("upnodes")
   for line in upnodes.readlines():
@@ -100,7 +101,7 @@
       procs = int(m.group("procs"))
       free = int(m.group("free"))
       # remember the cluster with the most free nodes
-      if free > nodes and name != 'total': 
+      if free > nodes and name != 'orange' and name != 'total': 
         nodes = free
         cluster = name
       clusters.append((name,procs,free))
@@ -113,6 +114,8 @@
       # red reports two cpus per node
       nodes /= 2
       ppn = ':ppn=2'
+      # hack: work around a corner case
+      if nodes > 1: nodes = nodes - 1
     else:
       ppn = ''
     resources = 'nodes=%d:%s:run%s,walltime=20:00' % (nodes, cluster, ppn)
@@ -161,7 +164,7 @@
     make = make >> 8
   else:
     # build on a compile node
-    resources = 'nodes=1:compile'
+    resources = 'nodes=1:build32'
     report = 'update.log'
     if os.path.exists(report): os.unlink(report)
     make = pbsjob(cmd, resources, stdout=report, stderr=report, mpi=False)

Modified: trunk/cluster/tti/run_regression
===================================================================
--- trunk/cluster/tti/run_regression	2007-10-21 20:49:29 UTC (rev 8310)
+++ trunk/cluster/tti/run_regression	2007-10-22 23:43:45 UTC (rev 8311)
@@ -25,7 +25,7 @@
       procs = int(m.group("procs"))
       free = int(m.group("free"))
       # remember the cluster with the most free nodes
-      if free > nodes and name != 'total': 
+      if free > nodes and name != 'orange' and name != 'total': 
         nodes = free
         cluster = name
       clusters.append((name,procs,free))
@@ -92,7 +92,7 @@
     make = make >> 8
   else:
     # build on a compile node
-    resources = 'nodes=1:compile'
+    resources = 'nodes=1:build32'
     report = 'build-' + rev + '.log'
     if os.path.exists(report): os.unlink(report)
     make = pbsjob(cmd, resources, stdout=report, stderr=report, mpi=False)



More information about the gs-cvs mailing list