/*
 * Copyright (c) 2003-2010 The Regents of the University of California.
 * All rights reserved.
 *
 * '$Author: welker $'
 * '$Date: 2010-05-06 05:21:26 +0000 (Thu, 06 May 2010) $'
 * '$Revision: 24234 $'
 *
 * Permission is hereby granted, without written agreement and without
 * license or royalty fees, to use, copy, modify, and distribute this
 * software and its documentation for any purpose, provided that the above
 * copyright notice and the following two paragraphs appear in all copies
 * of this software.
 *
 * IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
 * FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
 * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF
 * THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE
 * PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE UNIVERSITY OF
 * CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
 * ENHANCEMENTS, OR MODIFICATIONS.
 *
 */

package org.kepler.objectmanager.data.text;

import java.io.InputStream;
import java.util.Vector;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.kepler.objectmanager.data.db.Attribute;
import org.kepler.objectmanager.data.db.Entity;
import org.kepler.util.DelimitedReader;

/**
 * Reads a data input stream and splits it into row vectors according to the
 * {@link TextComplexDataFormat} array supplied by the entity metadata.
 *
 * <p>
 * The public entry point is {@link #getRowDataVectorFromStream()}. Each call
 * returns the next row; once the end of the stream has been reached an empty
 * vector is returned, so the method can be driven by a loop that stops at
 * the first empty vector.
 *
 * @author tao
 */
public class TextComplexFormatDataReader {
    private InputStream dataStream = null;
    private Entity entity = null;
    private boolean stripHeader = true;
    private int numberOfAttirbute = 0;
    private TextComplexDataFormat[] formats = null;
    private String physicalLineDelimiter = null;
    private int numberOfHeaderLines = 0;
    private int physicalLineDelimiterLength = 0;
    // number of header lines already consumed; kept across calls
    private int headLineNumberCount = 0;

    private static final Log log = LogFactory
            .getLog("org.kepler.objectmanager.data.text.TextComplexFormatDataReader");

    // constants
    /** Value used to pad a row that ended before every attribute was read. */
    public static final String DEFAULTVALUE = "";

    /**
     * Constructor with the default stripHeader value - true.
     *
     * @param dataStream
     *            the data input stream
     * @param entity
     *            the entity metadata describing the data stream
     * @throws Exception
     *             if an argument is null or the entity metadata is incomplete
     */
    public TextComplexFormatDataReader(InputStream dataStream, Entity entity)
            throws Exception {
        this(dataStream, entity, true);
    }

    /**
     * Constructor with an explicit stripHeader value.
     *
     * @param dataStream
     *            the data input stream
     * @param entity
     *            the entity metadata describing the data stream
     * @param stripHeader
     *            whether header lines are skipped while reading the stream
     * @throws Exception
     *             if an argument is null or the entity metadata is incomplete
     */
    public TextComplexFormatDataReader(InputStream dataStream, Entity entity,
            boolean stripHeader) throws Exception {
        if (dataStream == null || entity == null) {
            throw new Exception("Data inputstream or entity metadata is null");
        }
        this.dataStream = dataStream;
        this.entity = entity;
        this.stripHeader = stripHeader;
        getParameterFromEntity();
    }

    /*
     * Derives the settings the reader needs from the entity metadata: the
     * number of attributes, the number of header lines, the physical line
     * delimiter and the per-attribute formats.
     */
    private void getParameterFromEntity() throws Exception {
        Attribute[] attributeList = entity.getAttributes();
        if (attributeList == null) {
            throw new Exception("Attribute in entity metadata is null");
        }
        numberOfAttirbute = attributeList.length;

        // -1 is mapped to "no header lines"
        numberOfHeaderLines = entity.getNumHeaderLines();
        if (numberOfHeaderLines == -1) {
            numberOfHeaderLines = 0;
        }

        // the physical line delimiter comes from the physicalLineDelimiter
        // element; if that is absent the record delimiter is used instead
        physicalLineDelimiter = entity.getPhysicalLineDelimiter();
        if (physicalLineDelimiter == null) {
            physicalLineDelimiter = entity.getRecordDelimiter();
        }
        physicalLineDelimiter = DelimitedReader
                .unescapeDelimiter(physicalLineDelimiter);
        if (physicalLineDelimiter == null) {
            // fail with a clear message instead of a NullPointerException
            throw new Exception(
                    "Physical line delimiter is null in metadata entity");
        }
        physicalLineDelimiterLength = physicalLineDelimiter.length();

        formats = entity.getDataFormatArray();
        if (formats == null) {
            throw new Exception("Complext format is null in metadata entity");
        }
        if (formats.length != numberOfAttirbute) {
            throw new Exception(
                    "Complex formats should have same number as attribute number");
        }
    }

    /*
     * Feeds one character into the sliding window used to detect the physical
     * line delimiter. The window keeps at most the last
     * physicalLineDelimiterLength characters, so a multi-character delimiter
     * is recognised wherever it starts in the stream. Returns true, and
     * clears the window, when the window equals the delimiter. An empty
     * delimiter never matches.
     */
    private boolean endsWithLineDelimiter(StringBuffer window, char next) {
        if (physicalLineDelimiterLength == 0) {
            return false;
        }
        window.append(next);
        if (window.length() > physicalLineDelimiterLength) {
            // drop the oldest character so the window slides by one
            window.deleteCharAt(0);
        }
        if (window.length() == physicalLineDelimiterLength
                && window.toString().equals(physicalLineDelimiter)) {
            window.setLength(0);
            return true;
        }
        return false;
    }

    /**
     * Reads one row from the input stream and returns it as a vector whose
     * elements are the trimmed field values (Strings), one per attribute.
     * Once the end of the stream has been reached an empty vector is
     * returned, so callers can loop until an empty vector comes back.
     *
     * <p>
     * If the stream ends after at least one field of a row has been read,
     * the row is padded with {@link #DEFAULTVALUE} up to the number of
     * attributes.
     *
     * @return the next row, or an empty vector when no more data is available
     * @throws Exception
     *             if a format entry is null, a delimited format has no field
     *             delimiter, or reading the stream fails
     */
    public Vector getRowDataVectorFromStream() throws Exception {
        Vector oneRowDataVector = new Vector();
        // sliding window over the most recent characters, used to spot the
        // physical line delimiter
        StringBuffer lineDelimiterWindow = new StringBuffer();
        StringBuffer fieldValueBuffer = new StringBuffer();
        int columnCount = 1; // 1-based column of the current character
        int attributeCount = 0; // index of the attribute being read
        boolean startNewAttribute = true;
        boolean isWidthFix = true;
        int width = -1;
        int widthCount = 0;
        boolean startWidthCount = false;
        int startColumnNumberFromFormat = -1;
        String fieldDelimiter = null;

        if (dataStream != null) {
            int singleCharactor = dataStream.read();

            while (singleCharactor != -1) {
                // each byte read from the stream is treated as one character
                char charactor = (char) singleCharactor;

                if (stripHeader && numberOfHeaderLines > 0
                        && headLineNumberCount < numberOfHeaderLines) {
                    // still inside the header: only count line delimiters
                    if (endsWithLineDelimiter(lineDelimiterWindow, charactor)) {
                        headLineNumberCount++;
                    }
                } else {
                    // handle data after the header has been stripped
                    fieldValueBuffer.append(charactor);

                    // load the format information for a new attribute
                    if (startNewAttribute) {
                        startNewAttribute = false;
                        TextComplexDataFormat format = formats[attributeCount];
                        if (format == null) {
                            throw new Exception(
                                    "The text format is null for an attribute");
                        } else if (format instanceof TextWidthFixedDataFormat) {
                            TextWidthFixedDataFormat widthFormat = (TextWidthFixedDataFormat) format;
                            width = widthFormat.getFieldWidth();
                            startColumnNumberFromFormat = widthFormat
                                    .getFieldStartColumn();
                            isWidthFix = true;
                            startWidthCount = false;
                        } else if (format instanceof TextDelimitedDataFormat) {
                            TextDelimitedDataFormat delimitedFormat = (TextDelimitedDataFormat) format;
                            fieldDelimiter = delimitedFormat
                                    .getFieldDelimiter();
                            if (fieldDelimiter == null) {
                                // would otherwise fail later inside endsWith
                                throw new Exception(
                                        "The field delimiter is null for a delimited attribute");
                            }
                            isWidthFix = false;
                        }
                        // any other format type leaves the settings of the
                        // previous attribute in effect
                    }

                    if (isWidthFix) {
                        // find the start column if the metadata specifies one
                        if (startColumnNumberFromFormat != -1
                                && startColumnNumberFromFormat == columnCount) {
                            // discard what was collected before the field
                            fieldValueBuffer = new StringBuffer();
                            fieldValueBuffer.append(charactor);
                            startWidthCount = true;
                        } else if (startColumnNumberFromFormat == -1) {
                            startWidthCount = true;
                        }
                        if (startWidthCount) {
                            widthCount++;
                        }
                        // the value is complete once widthCount reaches the
                        // width of this format
                        if (widthCount == width) {
                            String value = fieldValueBuffer.toString();
                            log.debug("Add width fixed attribute value "
                                    + value + " to the vector");
                            oneRowDataVector.add(value.trim());
                            widthCount = 0;
                            startWidthCount = false;
                            fieldValueBuffer = new StringBuffer();
                            startNewAttribute = true;
                            attributeCount++;
                        }
                    } else {
                        // delimited data: the value ends at the delimiter
                        String collected = fieldValueBuffer.toString();
                        if (collected.endsWith(fieldDelimiter)) {
                            String value = collected.substring(0, collected
                                    .length()
                                    - fieldDelimiter.length());
                            log.debug("Add delimited attribute value " + value
                                    + " to the vector");
                            oneRowDataVector.add(value.trim());
                            fieldValueBuffer = new StringBuffer();
                            startNewAttribute = true;
                            attributeCount++;
                        }
                    }

                    columnCount++;

                    // reset columnCount to 1 when a physical line delimiter
                    // has just been completed
                    if (endsWithLineDelimiter(lineDelimiterWindow, charactor)) {
                        columnCount = 1;
                    }

                    // a full row has been collected; stop before reading the
                    // next character so it stays in the stream
                    if (attributeCount == numberOfAttirbute) {
                        break;
                    }
                }
                singleCharactor = dataStream.read();
            }
        }

        // if the row is not empty but holds fewer values than there are
        // attributes, pad it with DEFAULTVALUE
        if (!oneRowDataVector.isEmpty()
                && oneRowDataVector.size() < numberOfAttirbute) {
            for (int i = oneRowDataVector.size(); i < numberOfAttirbute; i++) {
                oneRowDataVector.add(DEFAULTVALUE);
            }
        }
        return oneRowDataVector;
    }

    /**
     * @return Returns the dataStream.
     */
    public InputStream getDataStream() {
        return dataStream;
    }

    /**
     * @param dataStream
     *            The dataStream to set.
     */
    public void setDataStream(InputStream dataStream) {
        this.dataStream = dataStream;
    }

    /**
     * @return Returns the entity.
     */
    public Entity getEntity() {
        return entity;
    }

    /**
     * Replaces the entity reference only. The settings derived from the
     * entity at construction time (attribute count, header lines, delimiter
     * and formats) are not recomputed.
     *
     * @param entity
     *            The entity to set.
     */
    public void setEntity(Entity entity) {
        this.entity = entity;
    }
}