001/* An actor that validates fasta file. 002 003 Copyright (c) 2003-2010 The Regents of the University of California. 004 All rights reserved. 005 Permission is hereby granted, without written agreement and without 006 license or royalty fees, to use, copy, modify, and distribute this 007 software and its documentation for any purpose, provided that the above 008 copyright notice and the following two paragraphs appear in all copies 009 of this software. 010 011 IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY 012 FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES 013 ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF 014 THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF 015 SUCH DAMAGE. 016 017 THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES, 018 INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 019 MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE 020 PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE UNIVERSITY OF 021 CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, 022 ENHANCEMENTS, OR MODIFICATIONS. 023 024 PT_COPYRIGHT_VERSION_2 025 COPYRIGHTENDKEY 026 */ 027package org.camera.service; 028 029import java.io.BufferedReader; 030import java.io.FileNotFoundException; 031import java.io.FileReader; 032import java.io.PrintWriter; 033import java.io.StringWriter; 034import java.util.ArrayList; 035import java.util.List; 036 037import org.apache.commons.logging.Log; 038import org.apache.commons.logging.LogFactory; 039import org.biojava.bio.BioException; 040import org.biojava.bio.seq.io.ParseException; 041import org.biojavax.Namespace; 042import org.biojavax.SimpleNamespace; 043import org.biojavax.bio.seq.RichSequence; 044import org.biojavax.bio.seq.RichSequenceIterator; 045 046import ptolemy.actor.TypedAtomicActor; 047import ptolemy.actor.TypedIOPort; 048import ptolemy.actor.parameters.PortParameter; 049import ptolemy.data.BooleanToken; 050import ptolemy.data.StringToken; 051import ptolemy.data.expr.FileParameter; 052import ptolemy.data.expr.StringParameter; 053import ptolemy.data.type.BaseType; 054import ptolemy.kernel.CompositeEntity; 055import ptolemy.kernel.util.IllegalActionException; 056import ptolemy.kernel.util.NameDuplicationException; 057 058 059////////////////////////////////////////////////////////////////////////// 060//// FastaValidation 061 062/** 063 * 064 * This actor reads a fasta file and user can select whether it 065 * contains 'PROTEIN', 'DNA', or 'RNA' sequences. Actor informs user if file 066 * is valid or not or there are any parsing issues through a boolean variable. 067 * It makes use of biojava library and bytecode jar file. 068 * User can provide information how many sequences they need to be parsed. 069 * If user does not specify the number or provides a negative, or zero, or provides 070 * a string instead then actor will parse the entire file. 071 * 072 * Here if there is incorrect file format or there is problem with the first sequence 073 * then all the errors are not listed. However, it is an irrelevant matter when 074 * there is a file format problem. However, the two error conditions are coupled in 075 * biojava exception. 076 * 077 * @author Madhu, SDSC 078 * @version $Id: FastaValidation.java 31113 2012-11-26 22:19:36Z crawl $ 079 */ 080public class FastaValidation extends TypedAtomicActor { 081 /** Construct an actor with a name and a container. 082 * The container argument must not be null, or a 083 * NullPointerException will be thrown. 084 * @param container The container. 085 * @param name The name of this actor. 086 * @exception IllegalActionException If the container is incompatible 087 * with this actor. 088 * @exception NameDuplicationException If the name coincides with 089 * an actor already in the container. 090 */ 091 092 private static final long serialVersionUID = 1L; 093 private static Log log = LogFactory.getLog(FastaValidation.class); 094 095 /* drop down value*/ 096 private static final String PROTEIN = "Protein"; 097 private static final String RNA = "RNA"; 098 private static final String DNA = "DNA"; 099 private static final int MAXNOOFALLOWEDPROBLEMS = 25; 100 101 102 /** The Number of sequence to be parsed, if no positive integer is provided 103 * then all the sequences in file will be parsed. 104 */ 105 public PortParameter parseNumSequence; 106 107 /** Input file. */ 108 public TypedIOPort inputPortFilePath; 109 110 /** Input sequence type */ 111 public TypedIOPort inputPortType; 112 113 /** Output sequence */ 114 public TypedIOPort outputPortStatus; 115 116 /** Output message */ 117 public TypedIOPort outputPortMessage; 118 119 public FastaValidation(CompositeEntity container, String name) 120 throws NameDuplicationException, IllegalActionException { 121 122 super(container, name); 123 124 inputPortFilePath = new TypedIOPort(this, "inputPortFilePath", true, false); 125 inputPortType = new TypedIOPort(this, "inputPortType", true, false); 126 outputPortMessage = new TypedIOPort(this, "outputPortMessage", false, true); 127 outputPortStatus = new TypedIOPort(this, "outputPortStatus", false, true); 128 129 inputPortFilePath.setTypeEquals(BaseType.STRING); 130 inputPortType.setTypeEquals(BaseType.STRING); 131 outputPortStatus.setTypeEquals(BaseType.BOOLEAN); 132 outputPortMessage.setTypeEquals(BaseType.STRING); 133 134 fileParameter = new FileParameter(this, "fileOrURL"); 135 136 dropDownValue = new StringParameter(this, "dropDownValue"); 137 dropDownValue.addChoice(DNA); 138 dropDownValue.addChoice(RNA); 139 dropDownValue.addChoice(PROTEIN); 140 141 parseNumSequence = new PortParameter(this, "parseNumSequence"); 142 parseNumSequence.setExpression(""); 143 parseNumSequence.setStringMode(true); 144 145 146 _attachText("_iconDescription", "<svg>\n" 147 + "<rect x=\"-25\" y=\"-20\" " + "width=\"50\" height=\"40\" " 148 + "style=\"fill:white\"/>\n" 149 + "<polygon points=\"-15,-10 -12,-10 -8,-14 -1,-14 3,-10" 150 + " 15,-10 15,10, -15,10\" " + "style=\"fill:red\"/>\n" 151 + "</svg>\n"); 152 } 153 154 /////////////////////////////////////////////////////////////////// 155 //// public variables //// 156 157 /** The file's full path. 158 * @see FileParameter 159 */ 160 public FileParameter fileParameter; 161 162 /** Drop down values for choices to be selected. 163 * If user does not make a selection then 164 * default value of 'DNA' is used. 165 */ 166 public StringParameter dropDownValue; 167 168 169 private String getPortParamValue() throws IllegalActionException { 170 171 return ((StringToken) parseNumSequence.getToken()).stringValue() 172 .trim(); 173 174 }// end-method getPortParamValue() 175 176 /////////////////////////////////////////////////////////////////// 177 //// public methods //// 178 179 /** Output the data read from the file or URL as a string. 180 * @exception IllegalActionException If there is no director or 181 * if reading the file triggers an exception. 182 */ 183 public void fire() throws IllegalActionException { 184 super.fire(); 185 parseNumSequence.update(); 186 187 boolean monitor = true; 188 boolean parseAllSequences = true; 189 190 String filePath = null; 191 StringBuilder message = new StringBuilder(); 192 193 // If the fileOrURL input port is connected and has data, then 194 // get the file name from there. 195 196 197 String numOfSeqBeParsedValue = getPortParamValue(); 198 int numOfSeqBeParsed = -1; 199 if(!ServiceUtils.checkEmptyString(numOfSeqBeParsedValue)){ 200 201 try{ 202 numOfSeqBeParsed = Integer.parseInt(numOfSeqBeParsedValue); 203 if(numOfSeqBeParsed > 0){ 204 parseAllSequences = false; 205 } 206 }catch(NumberFormatException nfe){ 207 nfe.printStackTrace(); 208 } 209 210 } 211 212 213 if (inputPortFilePath.isOutsideConnected()) { 214 if (inputPortFilePath.hasToken(0)) { 215 String name = ((StringToken) inputPortFilePath.get(0)) 216 .stringValue(); 217 218 // Using setExpression() rather than setToken() allows 219 // the string to refer to variables defined in the 220 // scope of this actor. 221 fileParameter.setExpression(name); 222 223 } 224 } 225 filePath = fileParameter.getExpression(); 226 227 if(ServiceUtils.checkEmptyString(filePath)){ 228 message.append("NO FILE NAME PROVIDED"); 229 outputPortStatus.send(0, new BooleanToken(monitor)); 230 outputPortMessage.send(0, new StringToken(String.valueOf(message.toString()))); 231 232 log.debug("FILE PATH IS EMPTY"); 233 message = null; 234 return; 235 } 236 237 log.debug("FILE NAME: " + filePath); 238 239 String type = null; 240 if (inputPortType.isOutsideConnected()) { 241 if (inputPortType.hasToken(0)) { 242 type = ((StringToken) inputPortType.get(0)).stringValue(); 243 if(type.equalsIgnoreCase(PROTEIN)){ 244 dropDownValue.setExpression(PROTEIN); 245 }else if(type.equalsIgnoreCase(DNA)){ 246 dropDownValue.setExpression(DNA); 247 }else if(type.equalsIgnoreCase(RNA)){ 248 dropDownValue.setExpression(RNA); 249 } 250 } 251 } 252 253 if(ServiceUtils.checkEmptyString(dropDownValue.getExpression())){ 254 dropDownValue.setExpression(DNA); 255 message.append("NO SELECTION MADE FOR TYPE: DEFAULT VALUE IS SET TO DNA"); 256 }else{ 257 message.append("SELECTION MADE BY USER FOR TYPE: ").append(dropDownValue.getExpression()); 258 } 259 message.append(ServiceUtils.LINESEP); 260 261 type = dropDownValue.getExpression(); 262 263 log.debug("SELECTED VALUE FROM DROP DOWN: " + type); 264 265 BufferedReader br = null; 266 try{ 267 br = new BufferedReader(new FileReader(filePath)); 268 }catch (FileNotFoundException ex) { 269 //problem reading file 270 System.out.println("FILE NOT FOUND"); 271 ex.printStackTrace(); 272 System.exit(1); 273 } 274 275 Namespace nm = new SimpleNamespace("CAMERA"); 276//get a SequenceDB of all sequences in the file 277 RichSequenceIterator db = null; 278 if(type.equals(DNA)){ 279 db = RichSequence.IOTools.readFastaDNA(br, nm); //readFasta(is, alpha); 280 }else if(type.equals(RNA)){ 281 db = RichSequence.IOTools.readFastaRNA(br, nm); //readFasta(is, alpha); 282 }else if(type.equals(PROTEIN)){ 283 db = RichSequence.IOTools.readFastaProtein(br, nm); //readFasta(is, alpha); 284 } 285 286 int number_of_sequences = 0; 287 288 int icheck = 0; 289 int problems = 0; 290 List<String> allM = new ArrayList<String>(); 291 RichSequence rseq = null; 292 boolean fileStatus = true; 293 294 295 /** 296 * Here, I am attempting to read first sequence to check file status 297 * before proceeding forward. I have to do this because of 298 * clunky behavior or biojava. 299 */ 300 if( db.hasNext() ){ 301 try{ 302 rseq = db.nextRichSequence(); 303 number_of_sequences++; 304 }catch(BioException bioexcep){ 305 fileStatus = false; 306 System.out.println("FILE STATUS: " + fileStatus); 307 message.append("INCORRECT FILE FORMAT OR FILE EMPTY OR FIRST SEQ HAS PROBLEM"); 308 monitor = false; 309 } 310 } 311 312 313 while( db.hasNext() && fileStatus ){ 314 number_of_sequences++; 315 try{ 316// Sequence seq = db.nextSequence(); 317 rseq = db.nextRichSequence(); 318// log.debug("ACCESSION: " + rseq.getAccession()); //NAME, URN, & ACCESSION COULD BE VERY SIMILAR 319// log.debug("SEQUENCE-NAME: " + rseq.getURN()); 320// log.debug("SEQUENCE-ENGTH: " + rseq.length()); 321// log.debug("SEQUENCE: " + rseq.seqString()); 322 323/* 324 Annotation seqAn = rseq.getAnnotation(); 325 for (Iterator i = seqAn.keys().iterator(); i.hasNext(); ) { 326 Object key = i.next(); 327 Object value = seqAn.getProperty(key); 328 log.debug(key.toString() + ": " + value.toString()); 329 } 330*/ 331 332 //user controls how many sequences need o be parsed. 333 if(!parseAllSequences && number_of_sequences >= numOfSeqBeParsed){ 334 break; 335 } 336 337 }catch (ParseException parex) { 338 monitor = false; 339 message.append("INCORRECT FILE FORMAT OR FILE NOT PARSEABLE OR PROBLEM WITH SEQUENCES") 340 .append(ServiceUtils.LINESEP); 341 message.append("PROBLEM WITH SEQUENCE #: " + (number_of_sequences + 1)); 342// .append(ex.getMessage()); 343 //not in fasta format or wrong alphabet 344 parex.printStackTrace(); 345 346 }catch (BioException ex) { 347// message.append(ex.getMessage() + "\n"); 348 monitor = false; 349// message.append("HI"); 350 //no fasta sequences in the file 351 ex.printStackTrace(); 352 353 StringWriter writer = new StringWriter(); 354 ex.printStackTrace(new PrintWriter(writer)); 355 String trace = writer.toString(); 356 357 358 // The following line may give impression that the catching 359 // IOException should obviate the need for if block. But do 360 // not waste time trying because it does not work. The 361 // version 1.7.1 of Biojava currently available now is quirky. 362 if(trace.contains("IOException")){ 363 ++icheck; 364 if(icheck % 2 == 0){ 365 ++problems; 366 allM.add("Problem at sequence :: " + --number_of_sequences ); 367 } 368 }else{ 369 ++problems; 370 allM.add("Problem at sequence : " + number_of_sequences ); 371 } 372 if(problems > MAXNOOFALLOWEDPROBLEMS){ 373 message.append("Number of problems exceeded :" + MAXNOOFALLOWEDPROBLEMS); 374 break; 375 } 376 377 }catch (Throwable throwable) { 378 monitor = false; 379 message.append("IN THROWABLE\n"); 380 throwable.printStackTrace(); 381 } finally { 382 if (fileParameter != null) { 383 fileParameter.close(); 384 } 385 }//end try block 386 387 }//end while 388 389 if(allM.size() > 0){ 390 for(String m: allM){ 391 message.append(m + "\n"); 392 } 393 } 394 395 message.append("Number of sequences parsed from the file: ") 396 .append(String.valueOf(number_of_sequences)) 397 .append(ServiceUtils.LINESEP); 398 399 outputPortMessage.send(0, new StringToken(String.valueOf(message.toString()))); 400 outputPortStatus.send(0, new BooleanToken(monitor)); 401 402 log.debug("MESSAGE: " + message.toString()); 403 message = null; 404 405 }// end-method fire() 406}