/* An actor that validates a FASTA file.
002
003 Copyright (c) 2003-2010 The Regents of the University of California.
004 All rights reserved.
005 Permission is hereby granted, without written agreement and without
006 license or royalty fees, to use, copy, modify, and distribute this
007 software and its documentation for any purpose, provided that the above
008 copyright notice and the following two paragraphs appear in all copies
009 of this software.
010
011 IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
012 FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
013 ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF
014 THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF
015 SUCH DAMAGE.
016
017 THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES,
018 INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
019 MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE
020 PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE UNIVERSITY OF
021 CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
022 ENHANCEMENTS, OR MODIFICATIONS.
023
024 PT_COPYRIGHT_VERSION_2
025 COPYRIGHTENDKEY
026 */
027package org.camera.service;
028
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.biojava.bio.BioException;
import org.biojava.bio.seq.io.ParseException;
import org.biojavax.Namespace;
import org.biojavax.SimpleNamespace;
import org.biojavax.bio.seq.RichSequence;
import org.biojavax.bio.seq.RichSequenceIterator;

import ptolemy.actor.TypedAtomicActor;
import ptolemy.actor.TypedIOPort;
import ptolemy.actor.parameters.PortParameter;
import ptolemy.data.BooleanToken;
import ptolemy.data.StringToken;
import ptolemy.data.expr.FileParameter;
import ptolemy.data.expr.StringParameter;
import ptolemy.data.type.BaseType;
import ptolemy.kernel.CompositeEntity;
import ptolemy.kernel.util.IllegalActionException;
import ptolemy.kernel.util.NameDuplicationException;
057
058
059//////////////////////////////////////////////////////////////////////////
060//// FastaValidation
061
062/**
063 *
 * This actor reads a FASTA file whose sequences the user declares to be
 * 'PROTEIN', 'DNA', or 'RNA'. The actor reports through a boolean output
 * whether the file is valid or whether there were parsing problems.
 * It uses the BioJava library and the bytecode jar file.
 * The user can specify how many sequences should be parsed.
 * If the user does not specify a number, or provides a negative number, zero,
 * or a non-numeric string, the actor parses the entire file.
 * 
 * If the file format is incorrect, or there is a problem with the first
 * sequence, the individual errors are not listed. That is of no consequence
 * when the file format itself is wrong; however, the two error conditions
 * cannot be told apart because BioJava reports both with the same
 * exception.
076 * 
077 * @author Madhu, SDSC 
078 * @version $Id: FastaValidation.java 31113 2012-11-26 22:19:36Z crawl $
079 */
080public class FastaValidation extends TypedAtomicActor {
081    /** Construct an actor with a name and a container.
082     *  The container argument must not be null, or a
083     *  NullPointerException will be thrown.
084     *  @param container The container.
085     *  @param name The name of this actor.
086     *  @exception IllegalActionException If the container is incompatible
087     *   with this actor.
088     *  @exception NameDuplicationException If the name coincides with
089     *   an actor already in the container.
090     */
091        
092        private static final long serialVersionUID = 1L;
093        private static Log log = LogFactory.getLog(FastaValidation.class);
094        
095        /* drop down value*/    
096        private static final String PROTEIN = "Protein";
097        private static final String RNA = "RNA";
098        private static final String DNA = "DNA";
099        private static final int MAXNOOFALLOWEDPROBLEMS = 25;
100                
101    
102    /** The  Number of sequence to be parsed, if no positive integer is provided
103     * then all the sequences in file will be parsed.
104     */
105        public PortParameter parseNumSequence;
106        
107        /** Input file. */
108        public TypedIOPort inputPortFilePath;
109        
110        /** Input sequence type */
111        public TypedIOPort inputPortType;
112        
113        /** Output sequence */
114        public TypedIOPort outputPortStatus;
115        
116        /** Output message */
117        public TypedIOPort outputPortMessage;
118        
119        public FastaValidation(CompositeEntity container,  String name) 
120                throws NameDuplicationException, IllegalActionException  {
121                
122        super(container, name);
123
124        inputPortFilePath = new TypedIOPort(this, "inputPortFilePath", true, false);
125        inputPortType = new TypedIOPort(this, "inputPortType", true, false);            
126                outputPortMessage = new TypedIOPort(this, "outputPortMessage", false, true);
127                outputPortStatus = new TypedIOPort(this, "outputPortStatus", false, true);
128                
129                inputPortFilePath.setTypeEquals(BaseType.STRING);
130                inputPortType.setTypeEquals(BaseType.STRING);
131                outputPortStatus.setTypeEquals(BaseType.BOOLEAN);
132                outputPortMessage.setTypeEquals(BaseType.STRING);
133                
134                fileParameter = new FileParameter(this, "fileOrURL");
135                
136                dropDownValue = new StringParameter(this, "dropDownValue");
137        dropDownValue.addChoice(DNA);
138        dropDownValue.addChoice(RNA); 
139        dropDownValue.addChoice(PROTEIN); 
140        
141        parseNumSequence = new PortParameter(this, "parseNumSequence");
142                parseNumSequence.setExpression("");
143                parseNumSequence.setStringMode(true);
144        
145
146        _attachText("_iconDescription", "<svg>\n"
147                + "<rect x=\"-25\" y=\"-20\" " + "width=\"50\" height=\"40\" "
148                + "style=\"fill:white\"/>\n"
149                + "<polygon points=\"-15,-10 -12,-10 -8,-14 -1,-14 3,-10"
150                + " 15,-10 15,10, -15,10\" " + "style=\"fill:red\"/>\n"
151                + "</svg>\n");
152    }
153
154    ///////////////////////////////////////////////////////////////////
155    ////                         public variables                  ////
156
157    /** The file's full path.
158     *  @see FileParameter
159     */
160    public FileParameter fileParameter;
161
162    /** Drop down values for choices to be selected. 
163         *  If user does not make a selection then 
164         *  default value of 'DNA' is used.
165         */
166    public StringParameter dropDownValue;
167   
168
169    private String getPortParamValue() throws IllegalActionException {
170
171                return ((StringToken) parseNumSequence.getToken()).stringValue()
172                                        .trim();
173
174        }// end-method getPortParamValue()
175    
176    ///////////////////////////////////////////////////////////////////
177    ////                         public methods                    ////
178
179    /** Output the data read from the file or URL as a string.
180     *  @exception IllegalActionException If there is no director or
181     *   if reading the file triggers an exception.
182     */
183    public void fire() throws IllegalActionException {
184        super.fire();
185        parseNumSequence.update();
186        
187        boolean monitor = true;
188        boolean parseAllSequences = true;
189        
190        String filePath = null;
191        StringBuilder message = new StringBuilder();
192
193        // If the fileOrURL input port is connected and has data, then
194        // get the file name from there.
195               
196        
197        String numOfSeqBeParsedValue = getPortParamValue();
198        int numOfSeqBeParsed = -1;
199        if(!ServiceUtils.checkEmptyString(numOfSeqBeParsedValue)){
200                     
201                try{
202                        numOfSeqBeParsed = Integer.parseInt(numOfSeqBeParsedValue);
203                        if(numOfSeqBeParsed > 0){
204                                parseAllSequences = false;
205                        }
206                }catch(NumberFormatException nfe){
207                        nfe.printStackTrace();
208                }
209                
210        }
211                                   
212                
213        if (inputPortFilePath.isOutsideConnected()) {
214            if (inputPortFilePath.hasToken(0)) {
215                String name = ((StringToken) inputPortFilePath.get(0))
216                        .stringValue();
217
218                // Using setExpression() rather than setToken() allows
219                // the string to refer to variables defined in the
220                // scope of this actor.
221                fileParameter.setExpression(name);
222                
223            }
224        }
225        filePath = fileParameter.getExpression();       
226        
227        if(ServiceUtils.checkEmptyString(filePath)){
228                message.append("NO FILE NAME PROVIDED");
229                outputPortStatus.send(0, new BooleanToken(monitor));
230                outputPortMessage.send(0, new StringToken(String.valueOf(message.toString())));
231                
232                log.debug("FILE PATH IS EMPTY");
233            message = null;           
234                return;         
235        }
236        
237        log.debug("FILE NAME: " + filePath);
238        
239        String type = null;
240        if (inputPortType.isOutsideConnected()) {
241                if (inputPortType.hasToken(0)) {
242                type = ((StringToken) inputPortType.get(0)).stringValue();
243                if(type.equalsIgnoreCase(PROTEIN)){
244                        dropDownValue.setExpression(PROTEIN);
245                }else if(type.equalsIgnoreCase(DNA)){
246                        dropDownValue.setExpression(DNA);
247                }else if(type.equalsIgnoreCase(RNA)){
248                        dropDownValue.setExpression(RNA);
249                }
250                }
251        }
252        
253        if(ServiceUtils.checkEmptyString(dropDownValue.getExpression())){
254                dropDownValue.setExpression(DNA);
255                message.append("NO SELECTION MADE FOR TYPE: DEFAULT VALUE IS SET TO DNA");
256        }else{
257                message.append("SELECTION MADE BY USER FOR TYPE: ").append(dropDownValue.getExpression());
258        }
259        message.append(ServiceUtils.LINESEP);
260        
261        type = dropDownValue.getExpression();
262        
263        log.debug("SELECTED VALUE FROM DROP DOWN: " + type);
264        
265        BufferedReader br = null;
266                try{
267                        br = new BufferedReader(new FileReader(filePath));
268                }catch (FileNotFoundException ex) {
269                        //problem reading file
270                        System.out.println("FILE NOT FOUND");
271                        ex.printStackTrace();
272                        System.exit(1);
273                }
274                                        
275        Namespace nm = new SimpleNamespace("CAMERA");
276//get a SequenceDB of all sequences in the file
277        RichSequenceIterator db = null;
278        if(type.equals(DNA)){
279                db = RichSequence.IOTools.readFastaDNA(br, nm); //readFasta(is, alpha);
280        }else if(type.equals(RNA)){
281                db = RichSequence.IOTools.readFastaRNA(br, nm); //readFasta(is, alpha);
282        }else if(type.equals(PROTEIN)){
283                db = RichSequence.IOTools.readFastaProtein(br, nm); //readFasta(is, alpha);
284        }
285                
286        int number_of_sequences = 0;
287        
288        int icheck = 0;
289        int problems = 0;
290        List<String> allM = new ArrayList<String>();
291        RichSequence rseq = null;
292        boolean fileStatus = true;
293        
294        
295        /**
296         * Here, I am attempting to read first sequence to check file status
297         * before proceeding forward. I have to do this because of
298         * clunky behavior or biojava.
299         */
300        if( db.hasNext() ){
301                try{
302                        rseq = db.nextRichSequence();
303                        number_of_sequences++;
304                }catch(BioException bioexcep){
305                        fileStatus = false;
306                        System.out.println("FILE STATUS: " + fileStatus);
307                        message.append("INCORRECT FILE FORMAT OR FILE EMPTY OR FIRST SEQ HAS PROBLEM");
308                        monitor = false;
309                }
310        }
311     
312       
313        while( db.hasNext() && fileStatus ){
314                number_of_sequences++;
315                try{
316//                      Sequence seq = db.nextSequence();
317                        rseq = db.nextRichSequence();
318//                      log.debug("ACCESSION: " + rseq.getAccession()); //NAME, URN, & ACCESSION COULD BE VERY SIMILAR
319//                      log.debug("SEQUENCE-NAME: " + rseq.getURN());
320//                      log.debug("SEQUENCE-ENGTH: " + rseq.length());
321//                      log.debug("SEQUENCE: " + rseq.seqString());
322                        
323/*                                              
324                        Annotation seqAn = rseq.getAnnotation();
325                        for (Iterator i = seqAn.keys().iterator(); i.hasNext(); ) {
326                                Object key = i.next();
327                                Object value = seqAn.getProperty(key);
328                                log.debug(key.toString() + ": " + value.toString());
329                        }
330*/                      
331                         
332                        //user controls how many sequences need o be parsed.
333                        if(!parseAllSequences && number_of_sequences >=  numOfSeqBeParsed){
334                                break;
335                        }
336                        
337                }catch (ParseException parex) {
338                        monitor = false;
339                message.append("INCORRECT FILE FORMAT OR FILE NOT PARSEABLE OR PROBLEM WITH SEQUENCES")
340                .append(ServiceUtils.LINESEP);
341                message.append("PROBLEM WITH SEQUENCE #: " + (number_of_sequences + 1));
342//              .append(ex.getMessage());
343                //not in fasta format or wrong alphabet
344                parex.printStackTrace();
345                        
346            }catch (BioException ex) {
347//              message.append(ex.getMessage() + "\n");
348                monitor = false;
349//              message.append("HI");
350                //no fasta sequences in the file
351                ex.printStackTrace();
352                
353                StringWriter writer = new StringWriter();
354                ex.printStackTrace(new PrintWriter(writer));
355                String trace = writer.toString();
356                
357                    
358                // The following line may give impression that the catching
359                // IOException should obviate the need for if block. But do
360                // not waste time trying because it does not work. The
361                // version 1.7.1 of Biojava currently available now is quirky.
362                if(trace.contains("IOException")){
363                        ++icheck;
364                        if(icheck % 2 == 0){
365                                ++problems;
366                                allM.add("Problem at sequence :: " + --number_of_sequences );
367                        }                               
368                }else{
369                        ++problems;
370                        allM.add("Problem at sequence : " + number_of_sequences );
371                }
372                if(problems > MAXNOOFALLOWEDPROBLEMS){
373                        message.append("Number of problems exceeded :" + MAXNOOFALLOWEDPROBLEMS);
374                        break;
375                }
376
377            }catch (Throwable throwable) {
378                monitor = false;
379                message.append("IN THROWABLE\n");
380                        throwable.printStackTrace();            
381            } finally {
382                if (fileParameter != null) {
383                    fileParameter.close();
384                }
385            }//end try block
386                        
387        }//end while
388        
389        if(allM.size() > 0){
390                for(String m: allM){
391                        message.append(m + "\n");                               
392                }
393        }  
394        
395        message.append("Number of sequences parsed from the file: ")
396                .append(String.valueOf(number_of_sequences))
397                .append(ServiceUtils.LINESEP);
398        
399        outputPortMessage.send(0, new StringToken(String.valueOf(message.toString())));
400        outputPortStatus.send(0, new BooleanToken(monitor));
401        
402        log.debug("MESSAGE: " + message.toString());
403        message = null;
404        
405    }// end-method fire()
406}