001/* 002 * Copyright (c) 2004-2010 The Regents of the University of California. 003 * All rights reserved. 004 * 005 * '$Author: crawl $' 006 * '$Date: 2012-07-27 18:35:29 +0000 (Fri, 27 Jul 2012) $' 007 * '$Revision: 30295 $' 008 * 009 * Permission is hereby granted, without written agreement and without 010 * license or royalty fees, to use, copy, modify, and distribute this 011 * software and its documentation for any purpose, provided that the above 012 * copyright notice and the following two paragraphs appear in all copies 013 * of this software. 014 * 015 * IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY 016 * FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES 017 * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF 018 * THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF 019 * SUCH DAMAGE. 020 * 021 * THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES, 022 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 023 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE 024 * PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE UNIVERSITY OF 025 * CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, 026 * ENHANCEMENTS, OR MODIFICATIONS. 027 * 028 */ 029 030package org.kepler.job; 031 032import org.apache.commons.logging.Log; 033import org.apache.commons.logging.LogFactory; 034 035/** 036 * Support class for IBM's LoadLeveler job manager support Class JobManager uses 037 * the methods of a supporter class to submit jobs and check status 038 */ 039public class JobSupportLoadLeveler implements JobSupport { 040 041 private static final Log log = LogFactory 042 .getLog(JobSupportLoadLeveler.class.getName()); 043 private static final boolean isDebugging = log.isDebugEnabled(); 044 045 public JobSupportLoadLeveler() { 046 } 047 048 public void init(String llBinPath) { 049 if (llBinPath != null && !llBinPath.trim().equals("")) { 050 String binPath = new String(llBinPath); 051 if (!llBinPath.endsWith("/")) 052 binPath += "/"; 053 _llSubmitCmd = binPath + _llSubmitCmd; 054 _llStatusCmd = binPath + _llStatusCmd; 055 _llDeleteCmd = binPath + _llDeleteCmd; 056 } 057 } 058 059 /** 060 * Create a submission file for the specific job manager, based on the 061 * information available in Job: - executable name - input files - output 062 * files - arguments for the job 063 */ 064 public boolean createSubmitFile(String filename, Job job) { 065 066 return false; 067 } 068 069 /** 070 * Submit command for LoadLeveler return: the command for submission 071 */ 072 public String getSubmitCmd(String submitFile, String options, Job job) throws JobException { 073 074 if(job.getDependentJobs() != null) { 075 throw new JobException("Support for job dependencies with Load Leveler has not been implemented."); 076 } 077 078 String _commandStr; 079 if (options != null) 080 _commandStr = _llSubmitCmd + " " + options + " " + submitFile; 081 else 082 _commandStr = _llSubmitCmd + " " + submitFile; 083 084 return _commandStr; 085 } 086 087 /** 088 * Parse output of submission and get information: jobID return String jobID 089 * on success throws JobException at failure (will contain the error stream 090 * or output stream) 091 */ 092 public String parseSubmitOutput(String output, String error) 093 throws JobException { 094 095 // System.out.println("====LoadLeveler parse: picking the jobid from output..."); 096 /* 097 * LoadLeveler llsubmit output is several lines: on success, there is a 098 * line: "llsubmit: The job "jobID" has been submitted." if submitfile 099 * does not exists or other error:?? 100 */ 101 String jobID = null; 102 103 String sa[] = output.split("\n"); // cut up lines 104 int idx; 105 for (int i = 0; i < sa.length; i++) { 106 // if (isDebugging) log.debug("LoadLeveler status string " + i + 107 // " = "+ sa[i]); 108 idx = sa[i].indexOf(" has been submitted"); 109 if (idx > -1) { 110 // Successful job submission, jobID is in this line. 111 // Cut to the second quote, excluding the quote. 112 String temp = output.substring(0, idx - 1); 113 // start of jobid string after the first quote 114 int qidx = output.indexOf("\""); 115 if (qidx > -1) { 116 // cut from the first quote, excluding the qoute 117 jobID = temp.substring(qidx + 1); 118 if (isDebugging) 119 log.debug("LoadLeveler parse: jobID = " + jobID 120 + " temp = " + temp); 121 } 122 } 123 } 124 125 if (jobID == null) { 126 if (error != null && error.length() > 0) 127 throw new JobException( 128 "Error at submission of LoadLeveler job: " + error); 129 else 130 throw new JobException( 131 "Error at submission of LoadLeveler job: " + output); 132 } 133 return jobID; 134 } // end-of-submit 135 136 /** 137 * Get the command to ask the status of the job return: the String of 138 * command 139 */ 140 public String getStatusCmd(String jobID) { 141 String _commandStr = _llStatusCmd + jobID; 142 return _commandStr; 143 } 144 145 /** 146 * Parse output of status check command and get status info return: a 147 * JobStatusInfo object, or throws an JobException with the error output 148 */ 149 public JobStatusInfo parseStatusOutput(String jobID, int exitCode, 150 String output, String error) throws JobException { 151 152 // LoadLeveler status prints to stdout always, and never to stderror. 153 // exitCode != 0 is error, but exitCode==0 still may mean that job is 154 // not in queue. 155 // If job is in queue, the formatted report looks like: 156 // Step Id Owner Queue Date ST 157 // ------------------------ ----------- ----------- -- 158 // s00601.287247.0 jxhan 07/12 09:10 NQ 159 // 160 // 1 job step(s) in query, 0 waiting, 0 pending, 0 running, 1 held, 0 161 // preempted 162 // 163 // If job is not in the queue anymore, the message is 164 // ""llq: There is currently no job status to report." 165 166 // System.out.println("+++++ status: picking the status from output" ); 167 JobStatusInfo stat = new JobStatusInfo(); 168 stat.statusCode = JobStatusCode.NotInQueue; 169 170 if (exitCode != 0) { 171 // error case, error text in output 172 throw new JobException("LoadLeveler status query error:\n" + output); 173 } 174 175 // now we have 0 exitCode, so either get status info, or no job message 176 177 boolean foundStatus = false; 178 String localJobID = createLocalJobID(jobID); // a trick for LoadLeveler 179 180 String sa[] = output.split("\n"); 181 for (int i = 0; i < sa.length; i++) { 182 // if (isDebugging) log.debug("LoadLeveler status string " + i + 183 // " = "+ sa[i]); 184 if (sa[i].trim().startsWith(localJobID)) { 185 String vals[] = sa[i].trim().split("( )+", 9); 186 if (vals.length >= 5) { 187 String reportedJobID = vals[0].trim(); 188 stat.owner = vals[1].trim(); 189 stat.submissionTime = vals[2].trim() + " " + vals[3].trim(); 190 stat.runTime = new String("N/A"); 191 String sts = vals[4].trim(); 192 193 if (sts.equals("R") || // running 194 sts.equals("ST") || // starting 195 sts.equals("P") || // pending 196 sts.equals("CK") || // checkpointing 197 sts.equals("CP") || // prepare to complete 198 sts.equals("C") || // completed 199 sts.equals("E") || // preempted 200 sts.equals("EP") || // preempt pending 201 sts.equals("MP") // resume pending 202 ) { 203 204 stat.statusCode = JobStatusCode.Running; 205 206 } else if (sts.equals("I") || // idle 207 sts.equals("NQ") || // not queued (for running) 208 sts.equals("HU") || // user hold 209 sts.equals("H") || // user hold 210 sts.equals("HS") || // system hold 211 sts.equals("S") || // system hold 212 sts.equals("D") || // deferred 213 sts.equals("V") || // vacated 214 sts.equals("VP") || // vacated pending 215 sts.equals("RP") // remove pending 216 ) { 217 218 stat.statusCode = JobStatusCode.Wait; 219 220 } else if (sts.equals("CA") || // cancelled 221 sts.equals("TX") || // terminated 222 sts.equals("RM") // removed 223 ) { 224 225 stat.statusCode = JobStatusCode.NotInQueue; 226 227 } else { 228 /* 229 * possible states: NR never run X rejected XP reject 230 * pending 231 */ 232 stat.statusCode = JobStatusCode.Error; 233 } 234 foundStatus = true; 235 if (isDebugging) 236 log.debug("LoadLeveler status Values: jobid=" 237 + stat.jobID + " owner=" + stat.owner 238 + " submissionTime=" + stat.submissionTime 239 + " status=[" + sts + "]"); 240 } 241 } 242 } 243 // System.out.println("LoadLeveler status = " + stat.statusCode); 244 245 if (!foundStatus) { 246 if (output != null && output.length() > 0) { 247 // it can be the message: llq: There is currently no job status 248 // to report. 249 if (output 250 .startsWith("llq: There is currently no job status to report.")) { 251 stat.statusCode = JobStatusCode.NotInQueue; 252 } else { 253 log.warn("Output string = [" + output + "] len=" 254 + output.length()); 255 stat.statusCode = JobStatusCode.Error; 256 } 257 } else { // unknown thing happened, output is null 258 throw new JobException( 259 "LoadLeveler status produced an unknown situation for job " 260 + jobID); 261 } 262 } 263 264 return stat; 265 } 266 267 /** 268 * Get the command to remove a job from queue (either running or waiting 269 * jobs). return: the String of command 270 */ 271 public String getDeleteCmd(String jobID) { 272 String _commandStr = _llDeleteCmd + jobID; 273 return _commandStr; 274 } 275 276 /** 277 * Parse output of delete command. return: true or false indicating that the 278 * command was successful or not 279 */ 280 public boolean parseDeleteOutput(String jobID, int exitCode, String output, 281 String error) throws JobException { 282 if (exitCode == 0) 283 return true; 284 else 285 return false; 286 } 287 288 /** 289 * Create the usable jobID "host.job.step" from the "fullhostname.job.step". 290 * Submission reports jobID with the full hostname, e.g. 291 * s00509.nersc.gov.410337.0 Status query / delete works for such ID, 292 * however, they report the id with short hostname, e.g. s00509.410337.0 so 293 * we need that short id to get the status. 294 */ 295 private String createLocalJobID(String fullJobID) { 296 String vals[] = fullJobID.trim().split("\\."); 297 if (vals.length <= 3) { 298 // our theory does not fit reality. Not NERSC? Just return as it is. 299 return fullJobID; 300 } 301 302 String id = new String (); 303 if (vals.length > 4) // for format s00509.nersc.gov.410337.0 304 id = vals[0] + "." + vals[vals.length - 2] + "." + vals[vals.length - 1]; 305 else // for format s00509.nersc.gov.410337 306 id = vals[0] + "." + vals[vals.length - 1]; 307 308 //System.out.println("full id = " + fullJobID + " job id = " + id); 309 return id; 310 } 311 312 // //////////////////////////////////////////////////////////////////// 313 // // private variables //// 314 315 // The combined command to execute. 316 private String _llSubmitCmd = "llsubmit "; 317 //some machines may NOT support -j option 318 //private String _llStatusCmd = "llq -f %id %o %dq %st -j "; 319 //-j looks like an optional option even on machines that support it 320 private String _llStatusCmd = "llq -f %id %o %dq %st "; 321 private String _llDeleteCmd = "llcancel "; 322 323 public String getTaskStatusCmd(String jobID) throws NotSupportedException { 324 throw new NotSupportedException("Task parallel jobs are not supported"); 325 } 326 327 public TaskParallelJobStatusInfo parseTaskStatusOutput(String jobID, 328 int numTasks, int exitCode, String output, String error) 329 throws JobException, NotSupportedException { 330 throw new NotSupportedException("Task parallel jobs are not supported"); 331 } 332 333} // end-of-class-JobSupportLoadLeveler