001/* An actor that converts fastq format file to fasta format. 002 003 Copyright (c) 2003-2010 The Regents of the University of California. 004 All rights reserved. 005 Permission is hereby granted, without written agreement and without 006 license or royalty fees, to use, copy, modify, and distribute this 007 software and its documentation for any purpose, provided that the above 008 copyright notice and the following two paragraphs appear in all copies 009 of this software. 010 011 IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY 012 FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES 013 ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF 014 THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF 015 SUCH DAMAGE. 016 017 THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES, 018 INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 019 MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE 020 PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE UNIVERSITY OF 021 CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, 022 ENHANCEMENTS, OR MODIFICATIONS. 023 024 PT_COPYRIGHT_VERSION_2 025 COPYRIGHTENDKEY 026 */ 027 028package org.camera.service; 029 030import static org.biojavax.bio.seq.RichSequence.Tools.createRichSequence; 031 032import java.io.File; 033import java.io.FileInputStream; 034import java.io.FileNotFoundException; 035import java.io.FileOutputStream; 036import java.io.IOException; 037import java.io.InputStream; 038import java.util.zip.GZIPInputStream; 039 040import org.apache.commons.logging.Log; 041import org.apache.commons.logging.LogFactory; 042import org.biojava.bio.BioException; 043import org.biojava.bio.program.fastq.Fastq; 044import org.biojava.bio.program.fastq.FastqReader; 045import org.biojava.bio.program.fastq.IlluminaFastqReader; 046import org.biojava.bio.program.fastq.SangerFastqReader; 047import org.biojava.bio.program.fastq.SolexaFastqReader; 048import org.biojava.bio.seq.DNATools; 049import org.biojavax.SimpleNamespace; 050import org.biojavax.bio.seq.RichSequence; 051 052import ptolemy.actor.TypedAtomicActor; 053import ptolemy.actor.TypedIOPort; 054import ptolemy.actor.parameters.PortParameter; 055import ptolemy.data.BooleanToken; 056import ptolemy.data.StringToken; 057import ptolemy.data.expr.FileParameter; 058import ptolemy.data.expr.StringParameter; 059import ptolemy.data.type.BaseType; 060import ptolemy.kernel.CompositeEntity; 061import ptolemy.kernel.util.IllegalActionException; 062import ptolemy.kernel.util.NameDuplicationException; 063 064 065/** 066 * This actor converts fastq file to fasta format. It can take three 067 * different types of fastq files (Sanger, Solexa, and Illumina). However, 068 * user must defined the correct type of file else by default the 'Sanger' 069 * is selected from dropdown list. 070 * Actor can take regular fastq as well as gzip file for input, 071 * but the gzip file must have .gz extension. User should define the output 072 * file path else the output will be written to the user home directory with 073 * the same file prefix as input. User can define the name space if there 074 * is need to make modification to the sequence description else leave it 075 * empty. 076 * 077 * Actor has four input ports and two output ports. The message output port 078 * provides the general information about the input/output file. The status 079 * output port outputs true/false if the file processed correctly then 'true' 080 * will be emitted from the status port. 081 * 082 * Caution: Since biojava keeps the input file in memory, it is likely that 083 * this tool might break with very large files. It will help to gzip the fastq 084 * file to begin with. 085 * 086 * @author madhu 087 * @version $Id: Fastq2Fasta.java 31113 2012-11-26 22:19:36Z crawl $ 088 * 089 */ 090public class Fastq2Fasta extends TypedAtomicActor { 091 092 private static final long serialVersionUID = 1L; 093 private static final int DEFAULT_BUFFER_SIZE = 5096; 094 private static Log log = LogFactory.getLog(Fastq2Fasta.class); 095 096 /* drop down value*/ 097 private static final String SANGER = "Sanger"; 098 private static final String SOLEXA = "Solexa"; 099 private static final String ILLUMINA = "Illumina"; 100 101 /** 102 * Full path to the input file. 103 */ 104 public TypedIOPort inputFilePath; 105 106 /** 107 * Full path to output file. If this field is left empty 108 * then the output will be written to the home directory. 109 */ 110 public TypedIOPort outputFilePath; 111 112 /** Name space is a string that is added to the sequence description.*/ 113 public PortParameter nameSpace; 114 115 /** Input sequence data type */ 116 public TypedIOPort inputPortType; 117 118 /** Output status, it is boolean. */ 119 public TypedIOPort outputPortStatus; 120 121 /** Output message */ 122 public TypedIOPort outputPortMessage; 123 124 public Fastq2Fasta(CompositeEntity container, String name) 125 throws NameDuplicationException, IllegalActionException { 126 127 super(container, name); 128 129 inputFilePath = new TypedIOPort(this, "inputFilePath", true, false); 130 outputFilePath = new TypedIOPort(this, "outputFilePath", true, false); 131 inputPortType = new TypedIOPort(this, "inputPortType", true, false); 132 outputPortMessage = new TypedIOPort(this, "outputPortMessage", false, true); 133 outputPortStatus = new TypedIOPort(this, "outputPortStatus", false, true); 134 135 inputFilePath.setTypeEquals(BaseType.STRING); 136 outputFilePath.setTypeEquals(BaseType.STRING); 137 inputPortType.setTypeEquals(BaseType.STRING); 138 outputPortStatus.setTypeEquals(BaseType.BOOLEAN); 139 outputPortMessage.setTypeEquals(BaseType.STRING); 140 141 inputFileParameter = new FileParameter(this, "inputFile"); 142 outputFileParameter = new FileParameter(this, "outputFile"); 143 144 dropDownValue = new StringParameter(this, "dropDownValue"); 145 dropDownValue.addChoice(ILLUMINA); 146 dropDownValue.addChoice(SANGER); 147 dropDownValue.addChoice(SOLEXA); 148 149 nameSpace = new PortParameter(this, "nameSpace"); 150 nameSpace.setExpression(""); 151 nameSpace.setStringMode(true); 152 153 _attachText("_iconDescription", "<svg>\n" 154 + "<rect x=\"-25\" y=\"-20\" " + "width=\"50\" height=\"40\" " 155 + "style=\"fill:white\"/>\n" 156 + "<polygon points=\"-15,-10 -12,-10 -8,-14 -1,-14 3,-10" 157 + " 15,-10 15,10, -15,10\" " + "style=\"fill:red\"/>\n" 158 + "</svg>\n"); 159 } 160 161 /** Drop down values for choices to be selected. 162 * If user does not make a selection then 163 * default value of 'DNA' is used. 164 */ 165 public StringParameter dropDownValue; 166 167 168 /** The file's full path. 169 * @see FileParameter 170 */ 171 public FileParameter inputFileParameter; 172 173 /** The output file's full path. 174 * @see FileParameter 175 */ 176 public FileParameter outputFileParameter; 177 178 /** 179 * 180 */ 181 public void fire() throws IllegalActionException { 182 super.fire(); 183 184 boolean monitor = false; 185 186 String inFilePath = null; 187 String outFilePath = null; 188 String type = null; 189 String nmSpace = nameSpace.getExpression().trim(); 190 StringBuilder message = new StringBuilder(); 191 192 if (inputFilePath.isOutsideConnected()) { 193 if (inputFilePath.hasToken(0)) { 194 String name = ((StringToken) inputFilePath.get(0)) 195 .stringValue(); 196 197 // Using setExpression() rather than setToken() allows 198 // the string to refer to variables defined in the 199 // scope of this actor. 200 inputFileParameter.setExpression(name); 201 202 } 203 } 204 inFilePath = inputFileParameter.getExpression(); 205 206 if (outputFilePath.isOutsideConnected()) { 207 if (outputFilePath.hasToken(0)) { 208 String name = ((StringToken) outputFilePath.get(0)) 209 .stringValue(); 210 211 // Using setExpression() rather than setToken() allows 212 // the string to refer to variables defined in the 213 // scope of this actor. 214 outputFileParameter.setExpression(name); 215 216 } 217 } 218 outFilePath = outputFileParameter.getExpression(); 219 220 if (inputPortType.isOutsideConnected()) { 221 if (inputPortType.hasToken(0)) { 222 type = ((StringToken) inputPortType.get(0)).stringValue(); 223 if(type.equalsIgnoreCase(SANGER)){ 224 dropDownValue.setExpression(SANGER); 225 }else if(type.equalsIgnoreCase(ILLUMINA)){ 226 dropDownValue.setExpression(ILLUMINA); 227 }else if(type.equalsIgnoreCase(SOLEXA)){ 228 dropDownValue.setExpression(SOLEXA); 229 } 230 } 231 } 232 233 if(ServiceUtils.checkEmptyString(dropDownValue.getExpression())){ 234 dropDownValue.setExpression(SANGER); 235 message.append("NO SELECTION MADE FOR TYPE: DEFAULT VALUE IS SET TO SANGER"); 236 }else{ 237 message.append("SELECTION MADE BY USER FOR TYPE: ").append(dropDownValue.getExpression()); 238 } 239 message.append(ServiceUtils.LINESEP); 240 241 type = dropDownValue.getExpression(); 242 InputStream insrdr = null; 243 try{ 244 245 if(inFilePath.endsWith(".gz")){ 246 insrdr = new GZIPInputStream(new FileInputStream(inFilePath),DEFAULT_BUFFER_SIZE); 247 }else{ 248 insrdr = new FileInputStream(inFilePath); 249 } 250 message.append("Input file: " + inFilePath + ServiceUtils.LINESEP); 251 252 FastqReader qReader = null; 253 254 if(type.equals("Illumuna")){ 255 qReader = new IlluminaFastqReader(); 256 }else if(type.equals("Solexa")){ 257 qReader = new SolexaFastqReader(); 258 }else{ 259 qReader = new SangerFastqReader(); 260 } 261 262 if(ServiceUtils.checkEmptyString(outFilePath)){ 263 String homeDir = System.getProperty("user.home"); 264 String fName = inFilePath.replaceAll("^.*" + File.separator, "").split("[.]")[0]; 265 outFilePath = homeDir + File.separator + fName + ".fasta"; 266 } 267 message.append("Output file: " + outFilePath + ServiceUtils.LINESEP); 268 269 FileOutputStream outputFasta = new FileOutputStream(outFilePath); 270 271 int numOfSequences = 0; 272 for (Fastq fastq : qReader.read(insrdr)) { 273 numOfSequences++; 274 String sequence = fastq.getSequence(); 275 SimpleNamespace ns = new SimpleNamespace(nmSpace); 276 RichSequence richSequence = createRichSequence(ns, fastq.getDescription(), sequence, DNATools.getDNA()); 277 RichSequence.IOTools.writeFasta(outputFasta, richSequence, ns); 278 } 279 message.append("Total # of sequences in a file: " + numOfSequences + ServiceUtils.LINESEP); 280 monitor = true; 281 }catch(FileNotFoundException fnfe){ 282 fnfe.printStackTrace(); 283 log.debug("Input file does not exist.\n"); 284 //message.append("Input file does not exist.\n"); 285 throw new IllegalArgumentException("Input file does not exist!"); 286 }catch(IOException ioe){ 287 ioe.printStackTrace(); 288 throw new IllegalArgumentException("IOException thrown!"); 289 }catch(BioException bioe){ 290 bioe.printStackTrace(); 291 throw new IllegalArgumentException("BioException thrown!"); 292 } 293 System.out.println("DONE"); 294 295 outputPortMessage.send(0, new StringToken(String.valueOf(message.toString()))); 296 outputPortStatus.send(0, new BooleanToken(monitor)); 297 298 log.debug("MESSAGE: " + message.toString()); 299 message = null; 300 } 301 302}