/*
 * Copyright (c) 2004-2010 The Regents of the University of California.
 * All rights reserved.
 *
 *
 *
 * Permission is hereby granted, without written agreement and without
 * license or royalty fees, to use, copy, modify, and distribute this
 * software and its documentation for any purpose, provided that the above
 * copyright notice and the following two paragraphs appear in all copies
 * of this software.
 *
 * IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
 * FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
 * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF
 * THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE
 * PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE UNIVERSITY OF
 * CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
 * ENHANCEMENTS, OR MODIFICATIONS.
 *
 */

package org.kepler.job;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * Job-manager support class for SLURM. The Job class uses the methods of a
 * supporter class such as this one to build the submit / status / delete
 * command lines and to parse the text those commands print.
 */
public class JobSupportSLURM implements JobSupport {

    private static final Log log = LogFactory.getLog(JobSupportSLURM.class
            .getName());

    // Bare command prefixes (trailing separator included). init() always
    // rebuilds the effective commands from these, so it can be called more
    // than once without stacking path prefixes.
    private static final String SUBMIT_CMD = "sbatch ";
    private static final String STATUS_CMD = "squeue --job=";
    private static final String DELETE_CMD = "scancel ";

    // Success line printed by sbatch, e.g. "Submitted batch job 8125".
    // Group 1 is the numeric job id; anything after the digits is ignored.
    private static final Pattern SUBMIT_LINE = Pattern
            .compile("Submitted batch job ([0-9]+)");

    // Index of the last squeue column read by parseStatusOutput (run time).
    private static final int LAST_STATUS_FIELD = 5;

    // //////////////////////////////////////////////////////////////////
    // // private variables ////

    // The commands to execute, optionally prefixed with the SLURM bin path.
    private String _slurmSubmitCmd = SUBMIT_CMD;
    private String _slurmStatusCmd = STATUS_CMD;
    private String _slurmDeleteCmd = DELETE_CMD;

    public JobSupportSLURM() {
    }

    /**
     * Set the directory that contains the SLURM binaries. The directory is
     * prepended to the submit, status and delete commands.
     *
     * @param slurmBinPath
     *            directory of the SLURM executables; if null or blank the
     *            commands are left unchanged.
     */
    public void init(String slurmBinPath) {
        if (slurmBinPath == null || slurmBinPath.trim().isEmpty()) {
            return;
        }
        String binPath = slurmBinPath.trim();
        if (!binPath.endsWith("/")) {
            binPath += "/";
        }
        // Rebuild from the bare commands instead of prefixing the current
        // values, otherwise a second init() call would double the path.
        _slurmSubmitCmd = binPath + SUBMIT_CMD;
        _slurmStatusCmd = binPath + STATUS_CMD;
        _slurmDeleteCmd = binPath + DELETE_CMD;
    }

    /**
     * Create a submission file for the specific job manager, based on the
     * information available in Job: executable name, input files, output
     * files and arguments for the job.
     *
     * @return always false: this class does not generate submit files.
     */
    public boolean createSubmitFile(String filename, Job job) {
        return false;
    }

    /**
     * Build the submit command for SLURM.
     *
     * @param submitFile
     *            the batch script to submit.
     * @param options
     *            extra command line options, inserted before the script name;
     *            may be null.
     * @param job
     *            the job being submitted; the ids of its dependent jobs (if
     *            any) are added as an "afterok" dependency.
     * @return the command for submission.
     */
    public String getSubmitCmd(String submitFile, String options, Job job)
            throws JobException {

        StringBuilder command = new StringBuilder(_slurmSubmitCmd);

        // Only emit the dependency option when there is at least one job to
        // list; a bare "--dependency=afterok" carries no job id at all.
        Job[] dependentJobs = (job == null) ? null : job.getDependentJobs();
        if (dependentJobs != null && dependentJobs.length > 0) {
            command.append("--dependency=afterok");
            for (Job dependentJob : dependentJobs) {
                command.append(":" + dependentJob.status.jobID);
            }
        }

        if (options != null) {
            command.append(" " + options);
        }

        command.append(" " + submitFile);

        return command.toString();
    }

    /**
     * Parse the output of the submit command and extract the job id.
     *
     * On success sbatch prints a line such as "Submitted batch job 8125" on
     * stdout. If several lines match, the last one is used.
     *
     * @param output
     *            stdout of the submit command.
     * @param error
     *            stderr of the submit command.
     * @return the job id.
     * @throws JobException
     *             if no job id is found; the message contains the error
     *             stream if it is non-empty, otherwise the output stream.
     */
    public String parseSubmitOutput(String output, String error)
            throws JobException {

        String jobID = null;
        String jobIDLine = null;

        if (output != null) {
            for (String line : output.split("\\r?\\n")) {
                Matcher matcher = SUBMIT_LINE.matcher(line.trim());
                // lookingAt() anchors at the start of the line but tolerates
                // trailing text after the digits.
                if (matcher.lookingAt()) {
                    jobIDLine = line;
                    jobID = matcher.group(1);
                }
            }
        }

        if (log.isDebugEnabled()) {
            log.debug("SLURM parse: jobID = " + jobID + " jobIDLine = "
                    + jobIDLine);
        }

        if (jobID == null) {
            String details = (error != null && error.length() > 0) ? error
                    : output;
            throw new JobException("Error at submission of SLURM job: "
                    + details);
        }

        return jobID;
    } // end-of-submit

    /**
     * Get the command that asks for the status of a job.
     *
     * @return the command string.
     */
    public String getStatusCmd(String jobID) {
        return _slurmStatusCmd + jobID;
    }

    /**
     * Parse the output of the status command.
     *
     * Each output line is split on runs of spaces and the fields are read by
     * position: 0 = job id, 3 = owner, 4 = state code, 5 = run time. The
     * exit code is not used: a failing status command usually only means the
     * job has already left the queue.
     *
     * @param jobID
     *            id of the job whose row is looked up.
     * @param exitCode
     *            exit code of the status command (ignored).
     * @param output
     *            stdout of the status command.
     * @param error
     *            stderr of the status command.
     * @return a JobStatusInfo for the job; the status is NotInQueue when the
     *         job is not listed, and Error when the job is not listed and the
     *         error stream holds anything other than an unknown-job message.
     */
    public JobStatusInfo parseStatusOutput(String jobID, int exitCode,
            String output, String error) throws JobException {

        JobStatusInfo stat = new JobStatusInfo();
        stat.jobID = jobID;
        stat.statusCode = JobStatusCode.NotInQueue;

        boolean foundStatus = false;

        String[] rows = (output == null) ? new String[0] : output.split("\n");
        for (String row : rows) {
            String[] vals = row.trim().split("( )+", 9);
            // All fields up to the run time column must be present before
            // they are indexed.
            if (vals.length <= LAST_STATUS_FIELD) {
                continue;
            }
            String idField = vals[0].trim();
            // jobID may be longer than the first field, which is limited in
            // length, hence the prefix comparison.
            if (idField.isEmpty() || !jobID.startsWith(idField)) {
                continue;
            }

            stat.owner = vals[3].trim();
            stat.runTime = vals[5].trim();
            String sts = vals[4].trim();
            switch (sts) {
            case "CD":
                stat.statusCode = JobStatusCode.NotInQueue;
                break;
            case "CG":
            case "R":
                stat.statusCode = JobStatusCode.Running;
                break;
            case "S":
            case "Q":
            case "W":
            case "PD":
            default:
                // Every other state code is reported as waiting.
                stat.statusCode = JobStatusCode.Wait;
                break;
            }
            foundStatus = true;
            if (log.isDebugEnabled()) {
                log.debug("SLURM status Values: jobid=" + stat.jobID
                        + " owner=" + stat.owner + " runTime="
                        + stat.runTime + " status=[" + sts + "]");
            }
        }

        if (!foundStatus && error != null && error.length() > 0) {
            // An unknown/invalid job id only means that the job is no longer
            // known to the queue, which is not a failure.
            boolean unknownJob = error.startsWith("SLURM: Unknown Job Id")
                    || error.contains("Invalid job id specified");
            if (unknownJob) {
                stat.statusCode = JobStatusCode.NotInQueue;
            } else {
                log.warn("Error string = [" + error + "] len="
                        + error.length());
                stat.statusCode = JobStatusCode.Error;
            }
        }
        // Otherwise: no row and no error text, so the job is simply not in
        // the queue and the initial NotInQueue status stands.

        return stat;
    }

    /**
     * Get the command that removes a job from the queue (either running or
     * waiting jobs).
     *
     * @return the command string.
     */
    public String getDeleteCmd(String jobID) {
        return _slurmDeleteCmd + jobID;
    }

    /**
     * Parse the output of the delete command.
     *
     * @return true if the command exited with code 0, false otherwise.
     */
    public boolean parseDeleteOutput(String jobID, int exitCode, String output,
            String error) throws JobException {
        return exitCode == 0;
    }

    /**
     * Get the command that asks for the status of the tasks of a job. There
     * is no separate task status command here, so the job status command is
     * returned.
     */
    public String getTaskStatusCmd(String jobID) throws NotSupportedException {
        return getStatusCmd(jobID);
    }

    /**
     * Parse the output of the task status command. The job-level status is
     * parsed and assigned to every task, keyed "0" .. "numTasks-1".
     *
     * @return the job status together with one status code per task.
     */
    public TaskParallelJobStatusInfo parseTaskStatusOutput(String jobID,
            int numTasks, int exitCode, String output, String error)
            throws JobException, NotSupportedException {
        JobStatusInfo jobStatus = parseStatusOutput(jobID, exitCode, output,
                error);
        TaskParallelJobStatusInfo taskStatus = new TaskParallelJobStatusInfo(
                jobStatus);
        for (int i = 0; i < numTasks; i++) {
            taskStatus.taskStatusCodes.put(Integer.toString(i),
                    jobStatus.statusCode);
        }
        return taskStatus;
    }

} // end-of-class-JobSupportSLURM