001/* 002 * Copyright (c) 2003-2010 The Regents of the University of California. 003 * All rights reserved. 004 * 005 * '$Author: welker $' 006 * '$Date: 2010-05-06 05:21:26 +0000 (Thu, 06 May 2010) $' 007 * '$Revision: 24234 $' 008 * 009 * Permission is hereby granted, without written agreement and without 010 * license or royalty fees, to use, copy, modify, and distribute this 011 * software and its documentation for any purpose, provided that the above 012 * copyright notice and the following two paragraphs appear in all copies 013 * of this software. 014 * 015 * IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY 016 * FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES 017 * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF 018 * THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF 019 * SUCH DAMAGE. 020 * 021 * THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES, 022 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 023 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE 024 * PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE UNIVERSITY OF 025 * CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, 026 * ENHANCEMENTS, OR MODIFICATIONS. 027 * 028 */ 029 030package org.kepler.util; 031 032import java.io.InputStream; 033import java.io.InputStreamReader; 034import java.util.Vector; 035 036import org.apache.commons.logging.Log; 037import org.apache.commons.logging.LogFactory; 038 039/** 040 * tokenizes a delimited file. This reader assumes that one record is on one 041 * line which ends with the line 042 */ 043public class DelimitedReader { 044 private InputStreamReader dataReader; 045 private Vector[] lines; 046 private Vector linesVector; 047 private int numHeaderLines; 048 private int numRecords; 049 private boolean stripHeader = false; 050 private int numCols; 051 private String delimiter; 052 private String lineEnding; 053 private boolean collapseDilimiter = false; 054 private int numFooterLines = 0; 055 private Vector footerBuffer = new Vector(); 056 private boolean initializedFooterBuffer = false; 057 private int headLineNumberCount = 0; 058 private boolean isLenient = false; 059 private String discoveredLineEnding = null; 060 private static Vector possibleLineEndings = null; 061 062 private static Log log; 063 static { 064 log = LogFactory.getLog("org.kepler.util.DelimitedReader"); 065 possibleLineEndings = new Vector(); 066 possibleLineEndings.add("\n"); 067 possibleLineEndings.add("\r"); 068 possibleLineEndings.add("\r\n"); 069 } 070 071 /** 072 * constructor. reads the csv stream. 073 * 074 * @param delimString 075 * the delimited stream to read 076 * @param numCols 077 * the number of columns in the stream 078 * @param delimiter 079 * the delimiter to tokenize on 080 * @param numHeaderLines 081 * the number of lines to skip at the top of the file 082 * @param lineEnding 083 * the line ending char(s)...either "\n"lo (unix), 084 * @param isLenient 085 * specifies if extra columns should be ignored "\r\n" (windoze) 086 * or "\r" (mac) 087 */ 088 public DelimitedReader(String data, int numCols, String delimiter, 089 int numHeaderLines, String lineEnding, int numRecords, 090 boolean isLenient) throws Exception { 091 this.numHeaderLines = numHeaderLines; 092 this.numCols = numCols; 093 this.numRecords = numRecords; 094 log.debug("Delimiter is: " + delimiter); 095 this.delimiter = unescapeDelimiter(delimiter); 096 log.debug("LineEnding is: " + lineEnding); 097 this.lineEnding = unescapeDelimiter(lineEnding); 098 this.isLenient = isLenient; 099 100 // lines = new Vector[numRecords + numHeaderLines + 1]; 101 linesVector = new Vector(); 102 103 int begin = 0; 104 int end = 0; 105 // int i = 0; 106 while (end < data.length()) { // add each line of the string as an 107 // element in a vector 108 end = data.indexOf(this.lineEnding, begin); // DFH 'this.' added 109 if (end == -1) { 110 end = data.length(); 111 } 112 String line = data.substring(begin, end); 113 if (!line.trim().equals("")) { 114 // take off the line ending 115 // MBJ: I commented out the next line as it was improperly 116 // truncating lines 117 // I'm not sure why it was there in the first place, as the 118 // previous substring 119 // removed the delimiter 120 // line = line.substring(0, line.length() - 121 // lineEnding.length()); 122 123 // split the line based on the delimiter 124 Vector v = splitDelimitedRowStringIntoVector(line); 125 /* 126 * String[] s = line.split(delimiter.trim(), numCols); Vector v 127 * = new Vector(); for(int j=0; j<s.length; j++) { 128 * v.addElement(s[j]); } 129 * 130 * if(v.size() < numCols) { int vsize = v.size(); for(int j=0; 131 * j<numCols - vsize; j++) { //add any elements that aren't 132 * there so that all the records have the //same number of cols 133 * v.addElement(""); } } 134 */ 135 // lines[i] = v; 136 linesVector.add(v); 137 // i++; 138 } 139 // go to the next line 140 begin = end + this.lineEnding.length(); // DFH 'this.' added 141 } 142 143 int records = linesVector.size(); 144 if (records != this.numRecords) { 145 this.numRecords = records; 146 log 147 .warn("Metadata disagrees with actual data. Changing number of records to: " 148 + records); 149 } 150 lines = new Vector[records]; 151 for (int k = 0; k < records; k++) { 152 lines[k] = (Vector) linesVector.get(k); 153 } 154 /* 155 * for(int j=0; j<lines.length; j++) { if(lines[j] == null) { lines[j] = 156 * new Vector(); } } 157 */ 158 159 } 160 161 /** 162 * This constructor will read delimitered data from stream rather a string 163 * 164 * @param dataStream 165 * InputStream The input stream 166 * @param numCols 167 * int the number of columns 168 * @param delimiter 169 * String delimiter the delimiter to tokenize on 170 * @param numHeaderLines 171 * int numHeaderLines the number of lines to skip at the top of 172 * the file 173 * @param lineEnding 174 * String lineEnding the line ending char(s)...either "\n" 175 * (unix),"\r\n" (windoze) or "\r" (mac) 176 * @param numRecords 177 * int number of rows in the input stream 178 */ 179 public DelimitedReader(InputStream dataStream, int numCols, 180 String delimiter, int numHeaderLines, String lineEnding, 181 int numRecords, boolean stripHeader) { 182 this.dataReader = new InputStreamReader(dataStream); 183 this.numHeaderLines = numHeaderLines; 184 this.numCols = numCols; 185 this.numRecords = numRecords; 186 log.debug("Delimiter is: " + delimiter); 187 this.delimiter = unescapeDelimiter(delimiter); 188 log.debug("LineEnding is: " + lineEnding); 189 this.lineEnding = unescapeDelimiter(lineEnding); 190 this.stripHeader = stripHeader; 191 192 } 193 194 /** 195 * Method to set up data stream as source 196 * 197 * @param dataStream 198 * InputStream 199 */ 200 public void setInputStream(InputStream dataStream) { 201 this.dataReader = new InputStreamReader(dataStream); 202 } 203 204 /** 205 * Method to set up collapseDelimiter. If it is yes, consecutive dilimiters 206 * will be consider as single dilimiter. 207 * 208 * @param collapseDelimiter 209 */ 210 public void setCollapseDelimiter(boolean collapseDelimiter) { 211 this.collapseDilimiter = collapseDelimiter; 212 } 213 214 /** 215 * Set up the footer line number. 216 * 217 * @param numFooterLines 218 */ 219 public void setNumFooterLines(int numFooterLines) { 220 this.numFooterLines = numFooterLines; 221 } 222 223 public boolean isLenient() { 224 return isLenient; 225 } 226 227 public void setLenient(boolean isLenient) { 228 this.isLenient = isLenient; 229 } 230 231 /** 232 * This method is from data source as a input stream This method will read 233 * one row from and return a data vector which element is String and the 234 * value is field data. After reach the end of stream, empty vector will be 235 * returned. So this method can be iterated by a while loop until a empty 236 * vector hited. During the iteration, every data in the stream will be 237 * pulled out. 238 * 239 * @return Vector 240 */ 241 public Vector getRowDataVectorFromStream() throws Exception { 242 // System.out.println("the numFootLines is "+numFooterLines); 243 if (!initializedFooterBuffer) { 244 for (int i = 0; i < numFooterLines; i++) { 245 // System.out.println("the initialize with footer lines"); 246 String rowData = readOneRowDataString(); 247 // System.out.println("the data vector in initailize is "+rowData.toString()); 248 footerBuffer.add(rowData); 249 } 250 // this is for no footer lines 251 if (numFooterLines == 0) { 252 // System.out.println("the initialize without footer lines"); 253 String rowData = readOneRowDataString(); 254 // System.out.println("The initial buffere vector is "+rowData.toString()); 255 footerBuffer.add(rowData); 256 } 257 initializedFooterBuffer = true; 258 } 259 String nextRowData = readOneRowDataString(); 260 // System.out.println("the row string data from next row "+nextRowData.toString()); 261 String oneRowDataString = null; 262 Vector oneRowDataVector = new Vector(); 263 264 if (nextRowData != null) { 265 // System.out.println("before nextRowData is empty and nextRowData is "+nextRowData.toString()); 266 oneRowDataString = (String) footerBuffer.remove(0); 267 reIndexFooterBufferVector(); 268 footerBuffer.add(nextRowData); 269 } else if (numFooterLines == 0 && !footerBuffer.isEmpty()) { 270 // System.out.println("find the last line in fottlines num is 0!!!!!!!!"); 271 oneRowDataString = (String) footerBuffer.remove(0); 272 } 273 // System.out.println("helere!!!"); 274 if (oneRowDataString != null) { 275 log.debug("in dataReader is not null"); 276 oneRowDataVector = splitDelimitedRowStringIntoVector(oneRowDataString); 277 } 278 // System.out.println("the row data from buffer "+oneRowDataVector.toString()); 279 return oneRowDataVector; 280 } 281 282 /* 283 * This method will read a row data from vector. It discard the head lines. 284 * but it doesn't dsicard footer lines This method will be called by 285 * getRowDataVectorFromStream 286 */ 287 private String readOneRowDataString() { 288 // Vector oneRowDataVector = new Vector(); 289 StringBuffer rowData = new StringBuffer(); 290 String rowDataString = null; 291 int singleCharactor = -2; 292 293 if (dataReader != null) { 294 // log.debug("in dataReader is not null"); 295 try { 296 while (singleCharactor != -1) { 297 // log.debug("in singleCharactor is not null"); 298 singleCharactor = dataReader.read(); 299 char charactor = (char) singleCharactor; 300 rowData.append(charactor); 301 // find string - line ending in the row data 302 boolean foundLineEnding = (rowData.indexOf(lineEnding) != -1); 303 304 // if we are being lenient, try some other line endings for 305 // parsing the data 306 if (!foundLineEnding && this.isLenient()) { 307 // have we discovered the ending already in this data? 308 if (this.discoveredLineEnding != null) { 309 foundLineEnding = (rowData 310 .indexOf(this.discoveredLineEnding) != -1); 311 } 312 // otherwise we need to try a few of them out 313 else { 314 for (int i = 0; i < possibleLineEndings.size(); i++) { 315 String possibleLineEnding = (String) possibleLineEndings 316 .get(i); 317 foundLineEnding = (rowData 318 .indexOf(possibleLineEnding) != -1); 319 if (foundLineEnding) { 320 this.discoveredLineEnding = possibleLineEnding; 321 break; 322 } 323 } 324 } 325 } 326 // finally see if we found the end of the line 327 if (foundLineEnding) { 328 log.debug("found line ending"); 329 // strip the header lines 330 if (stripHeader && numHeaderLines > 0 331 && headLineNumberCount < numHeaderLines) { 332 // reset string buffer(descard the header line) 333 rowData = null; 334 rowData = new StringBuffer(); 335 336 } else { 337 rowDataString = rowData.toString(); 338 log.debug("The row data is " + rowDataString); 339 break; 340 } 341 headLineNumberCount++; 342 } 343 } 344 } catch (Exception e) { 345 log.debug("Couldn't read data from input stream"); 346 } 347 } 348 // System.out.println("the row data before reutrn is "+rowDataString); 349 return rowDataString; 350 } 351 352 /* 353 * This method will forward one index for every element, 1 -> 0, 2->1 354 */ 355 private void reIndexFooterBufferVector() { 356 for (int i = 0; i < numFooterLines - 2; i++) { 357 Vector element = (Vector) footerBuffer.elementAt(i + 1); 358 footerBuffer.add(i, element); 359 } 360 } 361 362 /* 363 * This method will read a delimitered string and put a delimitered part 364 * into an element in a vector. If the vector size is less than the column 365 * number empty string will be added. 366 */ 367 private Vector splitDelimitedRowStringIntoVector(String data) 368 throws Exception { 369 Vector result = new Vector(); 370 if (data == null) { 371 return result; 372 } 373 String[] s = null; 374 if (!collapseDilimiter) { 375 s = data.split(delimiter); 376 } else { 377 String newDelimiterWithRegExpress = delimiter + "+"; 378 s = data.split(newDelimiterWithRegExpress); 379 380 } 381 382 if (s != null) { 383 if (!isLenient && s.length > numCols) { 384 throw new Exception("Metadata sees data has " + numCols 385 + " columns but actually data has " + s.length 386 + " columns. Please make sure metadata is correct!"); 387 } 388 int columnCount = Math.min(s.length, numCols); 389 for (int j = 0; j < columnCount; j++) { 390 391 if (s[j] != null) { 392 result.addElement(s[j].trim()); 393 } else { 394 result.addElement(""); 395 } 396 } 397 // add any elements that aren't there so that all the records have 398 // the 399 // same number of cols 400 if (result.size() < numCols) { 401 int vsize = result.size(); 402 for (int j = 0; j < numCols - vsize; j++) { 403 result.addElement(""); 404 } 405 } 406 } 407 return result; 408 } 409 410 /** 411 * returns the data as an array of vectors. each vector will have the same 412 * number of elements as there are columns in the data. 413 * 414 * @param stripHeaderLines 415 * true if the header lines should not be included in the 416 * returned data, false otherwise 417 */ 418 public Vector[] getTokenizedData(boolean stripHeaderLines) { 419 if (stripHeaderLines) { 420 Vector[] strip = null; 421 if (numRecords > numHeaderLines) { 422 strip = new Vector[numRecords - numHeaderLines]; 423 for (int i = numHeaderLines; i < lines.length; i++) { 424 strip[i - numHeaderLines] = lines[i]; 425 } 426 } 427 return strip; 428 } else { 429 return lines; 430 } 431 } 432 433 /** 434 * returns a string representation of the data 435 */ 436 public String toString() { 437 StringBuffer sb = new StringBuffer(); 438 for (int i = 0; i < lines.length; i++) { 439 log.debug("line[" + (i + 1) + "]: " + lines[i].toString()); 440 for (int j = 0; j < lines[i].size(); j++) { 441 sb.append((String) lines[i].elementAt(j)); 442 if (j != lines[i].size() - 1) { 443 sb.append(" || "); 444 } 445 } 446 sb.append(lineEnding); 447 } 448 return sb.toString(); 449 } 450 451 /** 452 * Convert a string escaped representation of a delimiter character into an 453 * the actual String for that delimiter. This is used for translating 454 * escaped versions of tab, newline, and carriage return characters to their 455 * real character values. 456 * 457 * @param delimiter 458 * the String representing the delimiter 459 * @return the actual String for the delimiter 460 */ 461 public static String unescapeDelimiter(String delimiter) { 462 String newDelimiter = delimiter; 463 464 if (delimiter == null) { 465 log.debug("Delimiter is null and we set up to \n."); 466 newDelimiter = "\n"; 467 } else if (delimiter.equals("\\t")) { 468 log.debug("Tab interpreted incorrectly as string."); 469 newDelimiter = "\t"; 470 } else if (delimiter.equals("\\n")) { 471 log.debug("Newline interpreted incorrectly as string."); 472 newDelimiter = "\n"; 473 } else if (delimiter.equals("\\r")) { 474 log.debug("CR interpreted incorrectly as string."); 475 newDelimiter = "\r"; 476 } else if (delimiter.equals("\\r\\n")) { 477 log.debug("CRNL interpreted incorrectly as string."); 478 newDelimiter = "\r\n"; 479 } else if (delimiter.startsWith("#")) { 480 log.debug("XML entity charactor."); 481 String digits = delimiter.substring(1, delimiter.length()); 482 int radix = 10; 483 if (digits.startsWith("x")) { 484 log.debug("Radix is " + 16); 485 radix = 16; 486 digits = digits.substring(1, digits.length()); 487 } 488 log.debug("Int value of delimiter is " + digits); 489 490 newDelimiter = transferDigitsToCharString(radix, digits); 491 492 } else if (delimiter.startsWith("0x") || delimiter.startsWith("0X")) { 493 int radix = 16; 494 String digits = delimiter.substring(2, delimiter.length()); 495 log.debug("Int value of delimiter is " + digits); 496 newDelimiter = transferDigitsToCharString(radix, digits); 497 } 498 499 return newDelimiter; 500 } 501 502 private static String transferDigitsToCharString(int radix, String digits) { 503 if (digits == null) { 504 return null; 505 } 506 Integer integer = Integer.valueOf(digits, radix); 507 int inter = integer.intValue(); 508 log.debug("The decimal value of char is " + inter); 509 char charactor = (char) inter; 510 String newDelimiter = Character.toString(charactor); 511 log.debug("The new delimter is " + newDelimiter); 512 return newDelimiter; 513 } 514}