001/*
002 * Copyright (c) 2003-2010 The Regents of the University of California.
003 * All rights reserved.
004 *
005 * '$Author: welker $'
006 * '$Date: 2010-05-06 05:21:26 +0000 (Thu, 06 May 2010) $' 
007 * '$Revision: 24234 $'
008 * 
009 * Permission is hereby granted, without written agreement and without
010 * license or royalty fees, to use, copy, modify, and distribute this
011 * software and its documentation for any purpose, provided that the above
012 * copyright notice and the following two paragraphs appear in all copies
013 * of this software.
014 *
015 * IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
016 * FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
017 * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF
018 * THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF
019 * SUCH DAMAGE.
020 *
021 * THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES,
022 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
023 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE
024 * PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE UNIVERSITY OF
025 * CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
026 * ENHANCEMENTS, OR MODIFICATIONS.
027 *
028 */
029
030package org.kepler.objectmanager.data.text;
031
032import java.io.InputStream;
033import java.util.Vector;
034
035import org.apache.commons.logging.Log;
036import org.apache.commons.logging.LogFactory;
037import org.kepler.objectmanager.data.db.Attribute;
038import org.kepler.objectmanager.data.db.Entity;
039import org.kepler.util.DelimitedReader;
040
/**
 * This class reads a data input stream and splits it into row vectors based
 * on the given TextComplexDataFormat array. Its reading method is
 * getRowDataVectorFromStream. After the end of the stream is reached, an
 * empty vector is returned, so the method can be called in a while loop until
 * an empty vector is hit. During the iteration, every piece of data in the
 * stream is pulled out.
 *
 * @author tao
 */
049public class TextComplexFormatDataReader {
050        private InputStream dataStream = null;
051        private Entity entity = null;
052        private boolean stripHeader = true;
053        private int numberOfAttirbute = 0;
054        private TextComplexDataFormat[] formats = null;
055        private String physicalLineDelimiter = null;
056        private int numberOfHeaderLines = 0;
057        private int physicalLineDelimiterLength = 0;
058        private int headLineNumberCount = 0;
059
060        private static Log log;
061
062        static {
063                log = LogFactory
064                                .getLog("org.kepler.objectmanager.data.text.TextComplexFormatDataReader");
065        }
066
067        // constants
068        public static final String DEFAULTVALUE = "";
069
070        /**
071         * Consturctor with default stripHeader value - true
072         * 
073         * @param dataStream
074         * @param entity
075         */
076        public TextComplexFormatDataReader(InputStream dataStream, Entity entity)
077                        throws Exception {
078                this(dataStream, entity, true);
079        }
080
081        /**
082         * Constructor with assigned stripHeader value
083         * 
084         * @param dataStream
085         *            the data input stream
086         * @param enity
087         *            the entity metadata to describe the data stream
088         * @param stripHeader
089         *            if strip header when we hand read the input stream
090         */
091        public TextComplexFormatDataReader(InputStream dataStream, Entity entity,
092                        boolean stripHeader) throws Exception {
093                if (dataStream == null || entity == null) {
094                        throw new Exception("Data inputstream or entity metadata is null");
095                }
096                this.dataStream = dataStream;
097                this.entity = entity;
098                this.stripHeader = stripHeader;
099                getParameterFromEntity();
100
101        }
102
103        /*
104         * Method to set up other parameter will be used in the reader. Sucha as
105         * numberOfArributes, physicalLineDelimiter.
106         */
107        private void getParameterFromEntity() throws Exception {
108                Attribute[] attributeList = entity.getAttributes();
109                if (attributeList == null) {
110                        throw new Exception("Attribute in entity metadata is null");
111                } else {
112                        numberOfAttirbute = attributeList.length;
113                }
114
115                numberOfHeaderLines = entity.getNumHeaderLines();
116                if (numberOfHeaderLines == -1) {
117                        numberOfHeaderLines = 0;
118                }
119
120                // physicalLineDelmiter will get from physicalDelimiter elements
121                // if no physicalDelimiter element, we will try record delimter
122                physicalLineDelimiter = entity.getPhysicalLineDelimiter();
123                if (physicalLineDelimiter == null) {
124                        physicalLineDelimiter = entity.getRecordDelimiter();
125                }
126                physicalLineDelimiter = DelimitedReader
127                                .unescapeDelimiter(physicalLineDelimiter);
128                physicalLineDelimiterLength = physicalLineDelimiter.length();
129
130                formats = entity.getDataFormatArray();
131                if (formats == null) {
132                        throw new Exception("Complext format is null in metadata entity");
133                } else {
134                        int length = formats.length;
135                        if (length != numberOfAttirbute) {
136                                throw new Exception(
137                                                "Complex formats should have same number as attribute number");
138                        }
139                }
140        }
141
142        /**
143         * This method will read one row from inputstream and return a data vector
144         * which element is String and the value is field data. After reach the end
145         * of stream, empty vector will be returned. So this method can be iterated
146         * by a while loop until a empty vector hited. During the iteration, every
147         * data in the stream will be pulled out.
148         * 
149         * @return Vector
150         */
151        public Vector getRowDataVectorFromStream() throws Exception {
152                Vector oneRowDataVector = new Vector();
153                StringBuffer lineDelimiterBuffer = new StringBuffer();// to store
154                                                                                                                                // delmiter
155                StringBuffer fieldValueBuffer = new StringBuffer();
156                int singleCharactor = -2;
157                int columnCount = 1;// this is for every character in one row
158                int attributeCount = 0; // this is for every attribute
159                boolean startNewAttribute = true;
160                boolean isWidthFix = true;
161                int width = -1;
162                int widthCount = 0;
163                boolean startWidthCount = false;
164                int startColumnNumberFromFormat = -1;
165                String fieldDelimiter = null;
166
167                if (dataStream != null) {
168                        singleCharactor = dataStream.read();
169
170                        while (singleCharactor != -1) {
171                                char charactor = (char) singleCharactor;
172                                // strip header
173                                if (stripHeader && numberOfHeaderLines > 0
174                                                && headLineNumberCount < numberOfHeaderLines) {
175                                        lineDelimiterBuffer.append(charactor);
176                                        if (lineDelimiterBuffer.length() == physicalLineDelimiterLength
177                                                        && lineDelimiterBuffer.toString().equals(
178                                                                        physicalLineDelimiter)) {
179                                                // reset the delimiter buffer
180                                                lineDelimiterBuffer = new StringBuffer();
181                                                headLineNumberCount++;
182                                        } else if (lineDelimiterBuffer.length() == physicalLineDelimiterLength) {
183                                                // reset the delimiter buffer
184                                                lineDelimiterBuffer = new StringBuffer();
185                                        }
186
187                                } else {
188                                        // handle data after strip header
189                                        fieldValueBuffer.append(charactor);
190                                        lineDelimiterBuffer.append(charactor);
191
192                                        // set up format info
193                                        if (startNewAttribute) {
194                                                startNewAttribute = false;
195                                                // find the format from array
196                                                TextComplexDataFormat format = formats[attributeCount];
197                                                if (format == null) {
198                                                        throw new Exception(
199                                                                        "The text format is null for an attribute");
200                                                } else if (format instanceof TextWidthFixedDataFormat) {
201                                                        TextWidthFixedDataFormat widthFormat = (TextWidthFixedDataFormat) format;
202                                                        width = widthFormat.getFieldWidth();
203                                                        startColumnNumberFromFormat = widthFormat
204                                                                        .getFieldStartColumn();
205                                                        isWidthFix = true;
206                                                        startWidthCount = false;
207
208                                                } else if (format instanceof TextDelimitedDataFormat) {
209                                                        TextDelimitedDataFormat delimitedFormat = (TextDelimitedDataFormat) format;
210                                                        fieldDelimiter = delimitedFormat
211                                                                        .getFieldDelimiter();
212                                                        isWidthFix = false;
213                                                }
214                                        }
215
216                                        if (isWidthFix) {
217                                                // find start cloumn if metadata specify it
218                                                if (startColumnNumberFromFormat != -1
219                                                                && startColumnNumberFromFormat == columnCount) {
220                                                        fieldValueBuffer = new StringBuffer();
221                                                        fieldValueBuffer.append(charactor);
222                                                        startWidthCount = true;
223                                                } else if (startColumnNumberFromFormat == -1) {
224                                                        startWidthCount = true;
225                                                }
226                                                // start count width
227                                                if (startWidthCount) {
228                                                        widthCount++;
229                                                }
230                                                // we got the value when widthcount reach width of this
231                                                // format
232                                                if (widthCount == width) {
233                                                        String value = fieldValueBuffer.toString();
234                                                        log.debug("Add width fixed attribute value "
235                                                                        + value + " to the vector");
236                                                        oneRowDataVector.add(value.trim());
237                                                        widthCount = 0;
238                                                        startWidthCount = false;
239                                                        fieldValueBuffer = new StringBuffer();
240                                                        startNewAttribute = true;
241                                                        attributeCount++;
242                                                }
243
244                                        } else {
245                                                // for delimter data
246                                                if (fieldValueBuffer.toString()
247                                                                .endsWith(fieldDelimiter)) {
248                                                        String value = fieldValueBuffer.toString();
249                                                        value = value.substring(0, value.length()
250                                                                        - fieldDelimiter.length());
251                                                        log.debug("Add delimited attribute value " + value
252                                                                        + " to the vector");
253                                                        oneRowDataVector.add(value.trim());
254                                                        fieldValueBuffer = new StringBuffer();
255                                                        startNewAttribute = true;
256                                                        attributeCount++;
257                                                }
258                                        }
259
260                                        columnCount++;
261
262                                        // reset columnCount to 1 when hit a physical line delimiter
263                                        if (lineDelimiterBuffer.length() == physicalLineDelimiterLength
264                                                        && lineDelimiterBuffer.toString().equals(
265                                                                        physicalLineDelimiter)) {
266                                                // reset the delimiter buffer
267                                                lineDelimiterBuffer = new StringBuffer();
268                                                columnCount = 1;
269                                        } else if (lineDelimiterBuffer.length() == physicalLineDelimiterLength) {
270                                                // reset the delimiter buffer
271                                                lineDelimiterBuffer = new StringBuffer();
272                                        }
273
274                                        // get a row vector break it.
275                                        if (attributeCount == numberOfAttirbute) {
276                                                break;
277                                        }
278                                }
279                                singleCharactor = dataStream.read();
280                        }
281
282                }
283                // if row vector is not empty and its length less than number of
284                // attribute,
285                // we should add "" string to make its' length equals attribute length;
286                if (!oneRowDataVector.isEmpty()
287                                && oneRowDataVector.size() < numberOfAttirbute) {
288                        int size = oneRowDataVector.size();
289                        for (int i = size; i < numberOfAttirbute; i++) {
290                                oneRowDataVector.add(DEFAULTVALUE);
291                        }
292                }
293                return oneRowDataVector;
294        }
295
296        /**
297         * @return Returns the dataStream.
298         */
299        public InputStream getDataStream() {
300                return dataStream;
301        }
302
303        /**
304         * @param dataStream
305         *            The dataStream to set.
306         */
307        public void setDataStream(InputStream dataStream) {
308                this.dataStream = dataStream;
309        }
310
311        /**
312         * @return Returns the entity.
313         */
314        public Entity getEntity() {
315                return entity;
316        }
317
318        /**
319         * @param entity
320         *            The entity to set.
321         */
322        public void setEntity(Entity entity) {
323                this.entity = entity;
324        }
325}