001/*
002 * Copyright (c) 2003-2010 The Regents of the University of California.
003 * All rights reserved.
004 *
005 * '$Author: welker $'
006 * '$Date: 2010-05-06 05:21:26 +0000 (Thu, 06 May 2010) $' 
007 * '$Revision: 24234 $'
008 * 
009 * Permission is hereby granted, without written agreement and without
010 * license or royalty fees, to use, copy, modify, and distribute this
011 * software and its documentation for any purpose, provided that the above
012 * copyright notice and the following two paragraphs appear in all copies
013 * of this software.
014 *
015 * IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
016 * FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
017 * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF
018 * THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF
019 * SUCH DAMAGE.
020 *
021 * THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES,
022 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
023 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE
024 * PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE UNIVERSITY OF
025 * CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
026 * ENHANCEMENTS, OR MODIFICATIONS.
027 *
028 */
029
030package org.kepler.util;
031
032import java.io.InputStream;
033import java.io.InputStreamReader;
034import java.util.Vector;
035
036import org.apache.commons.logging.Log;
037import org.apache.commons.logging.LogFactory;
038
/**
 * Tokenizes delimited text. This reader assumes that each record occupies
 * exactly one line and that every line is terminated by the configured line
 * ending.
 */
043public class DelimitedReader {
044        private InputStreamReader dataReader;
045        private Vector[] lines;
046        private Vector linesVector;
047        private int numHeaderLines;
048        private int numRecords;
049        private boolean stripHeader = false;
050        private int numCols;
051        private String delimiter;
052        private String lineEnding;
053        private boolean collapseDilimiter = false;
054        private int numFooterLines = 0;
055        private Vector footerBuffer = new Vector();
056        private boolean initializedFooterBuffer = false;
057        private int headLineNumberCount = 0;
058        private boolean isLenient = false;
059        private String discoveredLineEnding = null;
060        private static Vector possibleLineEndings = null;
061
062        private static Log log;
063        static {
064                log = LogFactory.getLog("org.kepler.util.DelimitedReader");
065                possibleLineEndings = new Vector();
066                possibleLineEndings.add("\n");
067                possibleLineEndings.add("\r");
068                possibleLineEndings.add("\r\n");
069        }
070
071        /**
072         * constructor. reads the csv stream.
073         * 
074         * @param delimString
075         *            the delimited stream to read
076         * @param numCols
077         *            the number of columns in the stream
078         * @param delimiter
079         *            the delimiter to tokenize on
080         * @param numHeaderLines
081         *            the number of lines to skip at the top of the file
082         * @param lineEnding
083         *            the line ending char(s)...either "\n"lo (unix),
084         * @param isLenient
085         *            specifies if extra columns should be ignored "\r\n" (windoze)
086         *            or "\r" (mac)
087         */
088        public DelimitedReader(String data, int numCols, String delimiter,
089                        int numHeaderLines, String lineEnding, int numRecords,
090                        boolean isLenient) throws Exception {
091                this.numHeaderLines = numHeaderLines;
092                this.numCols = numCols;
093                this.numRecords = numRecords;
094                log.debug("Delimiter is: " + delimiter);
095                this.delimiter = unescapeDelimiter(delimiter);
096                log.debug("LineEnding is: " + lineEnding);
097                this.lineEnding = unescapeDelimiter(lineEnding);
098                this.isLenient = isLenient;
099
100                // lines = new Vector[numRecords + numHeaderLines + 1];
101                linesVector = new Vector();
102
103                int begin = 0;
104                int end = 0;
105                // int i = 0;
106                while (end < data.length()) { // add each line of the string as an
107                                                                                // element in a vector
108                        end = data.indexOf(this.lineEnding, begin); // DFH 'this.' added
109                        if (end == -1) {
110                                end = data.length();
111                        }
112                        String line = data.substring(begin, end);
113                        if (!line.trim().equals("")) {
114                                // take off the line ending
115                                // MBJ: I commented out the next line as it was improperly
116                                // truncating lines
117                                // I'm not sure why it was there in the first place, as the
118                                // previous substring
119                                // removed the delimiter
120                                // line = line.substring(0, line.length() -
121                                // lineEnding.length());
122
123                                // split the line based on the delimiter
124                                Vector v = splitDelimitedRowStringIntoVector(line);
125                                /*
126                                 * String[] s = line.split(delimiter.trim(), numCols); Vector v
127                                 * = new Vector(); for(int j=0; j<s.length; j++) {
128                                 * v.addElement(s[j]); }
129                                 * 
130                                 * if(v.size() < numCols) { int vsize = v.size(); for(int j=0;
131                                 * j<numCols - vsize; j++) { //add any elements that aren't
132                                 * there so that all the records have the //same number of cols
133                                 * v.addElement(""); } }
134                                 */
135                                // lines[i] = v;
136                                linesVector.add(v);
137                                // i++;
138                        }
139                        // go to the next line
140                        begin = end + this.lineEnding.length(); // DFH 'this.' added
141                }
142
143                int records = linesVector.size();
144                if (records != this.numRecords) {
145                        this.numRecords = records;
146                        log
147                                        .warn("Metadata disagrees with actual data. Changing number of records to: "
148                                                        + records);
149                }
150                lines = new Vector[records];
151                for (int k = 0; k < records; k++) {
152                        lines[k] = (Vector) linesVector.get(k);
153                }
154                /*
155                 * for(int j=0; j<lines.length; j++) { if(lines[j] == null) { lines[j] =
156                 * new Vector(); } }
157                 */
158
159        }
160
161        /**
162         * This constructor will read delimitered data from stream rather a string
163         * 
164         * @param dataStream
165         *            InputStream The input stream
166         * @param numCols
167         *            int the number of columns
168         * @param delimiter
169         *            String delimiter the delimiter to tokenize on
170         * @param numHeaderLines
171         *            int numHeaderLines the number of lines to skip at the top of
172         *            the file
173         * @param lineEnding
174         *            String lineEnding the line ending char(s)...either "\n"
175         *            (unix),"\r\n" (windoze) or "\r" (mac)
176         * @param numRecords
177         *            int number of rows in the input stream
178         */
179        public DelimitedReader(InputStream dataStream, int numCols,
180                        String delimiter, int numHeaderLines, String lineEnding,
181                        int numRecords, boolean stripHeader) {
182                this.dataReader = new InputStreamReader(dataStream);
183                this.numHeaderLines = numHeaderLines;
184                this.numCols = numCols;
185                this.numRecords = numRecords;
186                log.debug("Delimiter is: " + delimiter);
187                this.delimiter = unescapeDelimiter(delimiter);
188                log.debug("LineEnding is: " + lineEnding);
189                this.lineEnding = unescapeDelimiter(lineEnding);
190                this.stripHeader = stripHeader;
191
192        }
193
194        /**
195         * Method to set up data stream as source
196         * 
197         * @param dataStream
198         *            InputStream
199         */
200        public void setInputStream(InputStream dataStream) {
201                this.dataReader = new InputStreamReader(dataStream);
202        }
203
204        /**
205         * Method to set up collapseDelimiter. If it is yes, consecutive dilimiters
206         * will be consider as single dilimiter.
207         * 
208         * @param collapseDelimiter
209         */
210        public void setCollapseDelimiter(boolean collapseDelimiter) {
211                this.collapseDilimiter = collapseDelimiter;
212        }
213
214        /**
215         * Set up the footer line number.
216         * 
217         * @param numFooterLines
218         */
219        public void setNumFooterLines(int numFooterLines) {
220                this.numFooterLines = numFooterLines;
221        }
222
223        public boolean isLenient() {
224                return isLenient;
225        }
226
227        public void setLenient(boolean isLenient) {
228                this.isLenient = isLenient;
229        }
230
231        /**
232         * This method is from data source as a input stream This method will read
233         * one row from and return a data vector which element is String and the
234         * value is field data. After reach the end of stream, empty vector will be
235         * returned. So this method can be iterated by a while loop until a empty
236         * vector hited. During the iteration, every data in the stream will be
237         * pulled out.
238         * 
239         * @return Vector
240         */
241        public Vector getRowDataVectorFromStream() throws Exception {
242                // System.out.println("the numFootLines is "+numFooterLines);
243                if (!initializedFooterBuffer) {
244                        for (int i = 0; i < numFooterLines; i++) {
245                                // System.out.println("the initialize with footer lines");
246                                String rowData = readOneRowDataString();
247                                // System.out.println("the data vector in initailize is "+rowData.toString());
248                                footerBuffer.add(rowData);
249                        }
250                        // this is for no footer lines
251                        if (numFooterLines == 0) {
252                                // System.out.println("the initialize without footer lines");
253                                String rowData = readOneRowDataString();
254                                // System.out.println("The initial buffere vector is "+rowData.toString());
255                                footerBuffer.add(rowData);
256                        }
257                        initializedFooterBuffer = true;
258                }
259                String nextRowData = readOneRowDataString();
260                // System.out.println("the row string data from next row "+nextRowData.toString());
261                String oneRowDataString = null;
262                Vector oneRowDataVector = new Vector();
263
264                if (nextRowData != null) {
265                        // System.out.println("before nextRowData is empty and nextRowData is "+nextRowData.toString());
266                        oneRowDataString = (String) footerBuffer.remove(0);
267                        reIndexFooterBufferVector();
268                        footerBuffer.add(nextRowData);
269                } else if (numFooterLines == 0 && !footerBuffer.isEmpty()) {
270                        // System.out.println("find the last line in fottlines num is 0!!!!!!!!");
271                        oneRowDataString = (String) footerBuffer.remove(0);
272                }
273                // System.out.println("helere!!!");
274                if (oneRowDataString != null) {
275                        log.debug("in dataReader is not null");
276                        oneRowDataVector = splitDelimitedRowStringIntoVector(oneRowDataString);
277                }
278                // System.out.println("the row data from buffer "+oneRowDataVector.toString());
279                return oneRowDataVector;
280        }
281
282        /*
283         * This method will read a row data from vector. It discard the head lines.
284         * but it doesn't dsicard footer lines This method will be called by
285         * getRowDataVectorFromStream
286         */
287        private String readOneRowDataString() {
288                // Vector oneRowDataVector = new Vector();
289                StringBuffer rowData = new StringBuffer();
290                String rowDataString = null;
291                int singleCharactor = -2;
292
293                if (dataReader != null) {
294                        // log.debug("in dataReader is not null");
295                        try {
296                                while (singleCharactor != -1) {
297                                        // log.debug("in singleCharactor is not null");
298                                        singleCharactor = dataReader.read();
299                                        char charactor = (char) singleCharactor;
300                                        rowData.append(charactor);
301                                        // find string - line ending in the row data
302                                        boolean foundLineEnding = (rowData.indexOf(lineEnding) != -1);
303
304                                        // if we are being lenient, try some other line endings for
305                                        // parsing the data
306                                        if (!foundLineEnding && this.isLenient()) {
307                                                // have we discovered the ending already in this data?
308                                                if (this.discoveredLineEnding != null) {
309                                                        foundLineEnding = (rowData
310                                                                        .indexOf(this.discoveredLineEnding) != -1);
311                                                }
312                                                // otherwise we need to try a few of them out
313                                                else {
314                                                        for (int i = 0; i < possibleLineEndings.size(); i++) {
315                                                                String possibleLineEnding = (String) possibleLineEndings
316                                                                                .get(i);
317                                                                foundLineEnding = (rowData
318                                                                                .indexOf(possibleLineEnding) != -1);
319                                                                if (foundLineEnding) {
320                                                                        this.discoveredLineEnding = possibleLineEnding;
321                                                                        break;
322                                                                }
323                                                        }
324                                                }
325                                        }
326                                        // finally see if we found the end of the line
327                                        if (foundLineEnding) {
328                                                log.debug("found line ending");
329                                                // strip the header lines
330                                                if (stripHeader && numHeaderLines > 0
331                                                                && headLineNumberCount < numHeaderLines) {
332                                                        // reset string buffer(descard the header line)
333                                                        rowData = null;
334                                                        rowData = new StringBuffer();
335
336                                                } else {
337                                                        rowDataString = rowData.toString();
338                                                        log.debug("The row data is " + rowDataString);
339                                                        break;
340                                                }
341                                                headLineNumberCount++;
342                                        }
343                                }
344                        } catch (Exception e) {
345                                log.debug("Couldn't read data from input stream");
346                        }
347                }
348                // System.out.println("the row data before reutrn is "+rowDataString);
349                return rowDataString;
350        }
351
352        /*
353         * This method will forward one index for every element, 1 -> 0, 2->1
354         */
355        private void reIndexFooterBufferVector() {
356                for (int i = 0; i < numFooterLines - 2; i++) {
357                        Vector element = (Vector) footerBuffer.elementAt(i + 1);
358                        footerBuffer.add(i, element);
359                }
360        }
361
362        /*
363         * This method will read a delimitered string and put a delimitered part
364         * into an element in a vector. If the vector size is less than the column
365         * number empty string will be added.
366         */
367        private Vector splitDelimitedRowStringIntoVector(String data)
368                        throws Exception {
369                Vector result = new Vector();
370                if (data == null) {
371                        return result;
372                }
373                String[] s = null;
374                if (!collapseDilimiter) {
375                        s = data.split(delimiter);
376                } else {
377                        String newDelimiterWithRegExpress = delimiter + "+";
378                        s = data.split(newDelimiterWithRegExpress);
379
380                }
381
382                if (s != null) {
383                        if (!isLenient && s.length > numCols) {
384                                throw new Exception("Metadata sees data has " + numCols
385                                                + " columns but actually data has " + s.length
386                                                + " columns. Please make sure metadata is correct!");
387                        }
388                        int columnCount = Math.min(s.length, numCols);
389                        for (int j = 0; j < columnCount; j++) {
390
391                                if (s[j] != null) {
392                                        result.addElement(s[j].trim());
393                                } else {
394                                        result.addElement("");
395                                }
396                        }
397                        // add any elements that aren't there so that all the records have
398                        // the
399                        // same number of cols
400                        if (result.size() < numCols) {
401                                int vsize = result.size();
402                                for (int j = 0; j < numCols - vsize; j++) {
403                                        result.addElement("");
404                                }
405                        }
406                }
407                return result;
408        }
409
410        /**
411         * returns the data as an array of vectors. each vector will have the same
412         * number of elements as there are columns in the data.
413         * 
414         * @param stripHeaderLines
415         *            true if the header lines should not be included in the
416         *            returned data, false otherwise
417         */
418        public Vector[] getTokenizedData(boolean stripHeaderLines) {
419                if (stripHeaderLines) {
420                        Vector[] strip = null;
421                        if (numRecords > numHeaderLines) {
422                                strip = new Vector[numRecords - numHeaderLines];
423                                for (int i = numHeaderLines; i < lines.length; i++) {
424                                        strip[i - numHeaderLines] = lines[i];
425                                }
426                        }
427                        return strip;
428                } else {
429                        return lines;
430                }
431        }
432
433        /**
434         * returns a string representation of the data
435         */
436        public String toString() {
437                StringBuffer sb = new StringBuffer();
438                for (int i = 0; i < lines.length; i++) {
439                        log.debug("line[" + (i + 1) + "]: " + lines[i].toString());
440                        for (int j = 0; j < lines[i].size(); j++) {
441                                sb.append((String) lines[i].elementAt(j));
442                                if (j != lines[i].size() - 1) {
443                                        sb.append(" || ");
444                                }
445                        }
446                        sb.append(lineEnding);
447                }
448                return sb.toString();
449        }
450
451        /**
452         * Convert a string escaped representation of a delimiter character into an
453         * the actual String for that delimiter. This is used for translating
454         * escaped versions of tab, newline, and carriage return characters to their
455         * real character values.
456         * 
457         * @param delimiter
458         *            the String representing the delimiter
459         * @return the actual String for the delimiter
460         */
461        public static String unescapeDelimiter(String delimiter) {
462                String newDelimiter = delimiter;
463
464                if (delimiter == null) {
465                        log.debug("Delimiter is null and we set up to \n.");
466                        newDelimiter = "\n";
467                } else if (delimiter.equals("\\t")) {
468                        log.debug("Tab interpreted incorrectly as string.");
469                        newDelimiter = "\t";
470                } else if (delimiter.equals("\\n")) {
471                        log.debug("Newline interpreted incorrectly as string.");
472                        newDelimiter = "\n";
473                } else if (delimiter.equals("\\r")) {
474                        log.debug("CR interpreted incorrectly as string.");
475                        newDelimiter = "\r";
476                } else if (delimiter.equals("\\r\\n")) {
477                        log.debug("CRNL interpreted incorrectly as string.");
478                        newDelimiter = "\r\n";
479                } else if (delimiter.startsWith("#")) {
480                        log.debug("XML entity charactor.");
481                        String digits = delimiter.substring(1, delimiter.length());
482                        int radix = 10;
483                        if (digits.startsWith("x")) {
484                                log.debug("Radix is " + 16);
485                                radix = 16;
486                                digits = digits.substring(1, digits.length());
487                        }
488                        log.debug("Int value of  delimiter is " + digits);
489
490                        newDelimiter = transferDigitsToCharString(radix, digits);
491
492                } else if (delimiter.startsWith("0x") || delimiter.startsWith("0X")) {
493                        int radix = 16;
494                        String digits = delimiter.substring(2, delimiter.length());
495                        log.debug("Int value of  delimiter is " + digits);
496                        newDelimiter = transferDigitsToCharString(radix, digits);
497                }
498
499                return newDelimiter;
500        }
501
502        private static String transferDigitsToCharString(int radix, String digits) {
503                if (digits == null) {
504                        return null;
505                }
506                Integer integer = Integer.valueOf(digits, radix);
507                int inter = integer.intValue();
508                log.debug("The decimal value of char is " + inter);
509                char charactor = (char) inter;
510                String newDelimiter = Character.toString(charactor);
511                log.debug("The new delimter is " + newDelimiter);
512                return newDelimiter;
513        }
514}