001/* 002 * Copyright (c) 2003-2010 The Regents of the University of California. 003 * All rights reserved. 004 * 005 * '$Author: crawl $' 006 * '$Date: 2012-11-26 22:19:36 +0000 (Mon, 26 Nov 2012) $' 007 * '$Revision: 31113 $' 008 * 009 * Permission is hereby granted, without written agreement and without 010 * license or royalty fees, to use, copy, modify, and distribute this 011 * software and its documentation for any purpose, provided that the above 012 * copyright notice and the following two paragraphs appear in all copies 013 * of this software. 014 * 015 * IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY 016 * FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES 017 * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF 018 * THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF 019 * SUCH DAMAGE. 020 * 021 * THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES, 022 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 023 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE 024 * PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE UNIVERSITY OF 025 * CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, 026 * ENHANCEMENTS, OR MODIFICATIONS. 027 * 028 */ 029 030package org.ecoinformatics.seek.datasource.eml.eml2; 031 032import java.io.InputStream; 033import java.util.Hashtable; 034import java.util.List; 035import java.util.Vector; 036 037import javax.xml.parsers.DocumentBuilder; 038import javax.xml.parsers.DocumentBuilderFactory; 039 040import org.apache.commons.logging.Log; 041import org.apache.commons.logging.LogFactory; 042import org.apache.xpath.CachedXPathAPI; 043import org.kepler.metadata.ParserInterface; 044import org.kepler.objectmanager.data.DataType; 045import org.kepler.objectmanager.data.DataTypeResolver; 046import org.kepler.objectmanager.data.DateTimeDomain; 047import org.kepler.objectmanager.data.Domain; 048import org.kepler.objectmanager.data.EnumeratedDomain; 049import org.kepler.objectmanager.data.NumericDomain; 050import org.kepler.objectmanager.data.db.Attribute; 051import org.kepler.objectmanager.data.db.AttributeList; 052import org.kepler.objectmanager.data.db.Entity; 053import org.kepler.objectmanager.data.text.TextComplexDataFormat; 054import org.kepler.objectmanager.data.text.TextDelimitedDataFormat; 055import org.kepler.objectmanager.data.text.TextDomain; 056import org.kepler.objectmanager.data.text.TextWidthFixedDataFormat; 057import org.w3c.dom.Document; 058import org.w3c.dom.Element; 059import org.w3c.dom.NamedNodeMap; 060import org.w3c.dom.Node; 061import org.w3c.dom.NodeList; 062import org.xml.sax.InputSource; 063import org.xml.sax.SAXException; 064 065/** 066 * This plugin parses EML 2.0.0 metadata files 067 */ 068public class Eml200Parser implements ParserInterface { 069 070 //private static String NAMESPACE = "eml://ecoinformatics.org/eml-2.0.0"; 071 private String nameSpace = null; 072 private Hashtable<String, Entity> entityHash = new Hashtable<String, Entity>(); 073 private Vector<Entity> entityList = new Vector<Entity>();//this one will preserve the order of the entity. 074 // private Hashtable fileHash = new Hashtable(); 075 private int numEntities = 0; 076 private int numRecords = -1; 077 private Entity entityObject = null; 078 private DataTypeResolver dtr = DataTypeResolver.instanceOf(); 079 private int elementId = 0; 080 // private boolean hasImageEntity = false; 081 private int complexFormatsNumber = 0; 082 private Hashtable<String, AttributeList> attributeListHash = new Hashtable<String, AttributeList>(); 083 private boolean hasMissingValue = false; 084 085 private static Log log; 086 private static boolean isDebugging; 087 088 static { 089 log = LogFactory 090 .getLog("org.ecoinformatics.seek.datasource.eml.eml2.Eml200Parser"); 091 isDebugging = log.isDebugEnabled(); 092 } 093 094 // constants 095 public static final String TABLEENTITY = "//dataset/dataTable"; 096 public static final String SPATIALRASTERENTITY = "//dataset/spatialRaster"; 097 public static final String SPATIALVECTORENTITY = "//dataset/spatialVector"; 098 public static final String STOREDPROCEDUREENTITY = "//dataset/storedProcedure"; 099 public static final String VIEWENTITY = "//dataset/view"; 100 public static final String OTHERENTITY = "//dataset/otherEntity"; 101 public static final String EML = "eml"; 102 public static final String PACKAGEID="packageId"; 103 private static final String INFORMATION = "information"; 104 105 /** 106 * returns a hashtable of with the id of the entity as the key and the data 107 * file id to which the entity refers as the value. This way, if you want to 108 * know what data file goes with an entity, you can do a get on this hash 109 * for the id of the entity. note that the entity id is the xml entity id 110 * from the generated input step, not the id of the entity file itself. 111 */ 112 /* 113 * public Hashtable getDataFilesHash() { return fileHash; } 114 */ 115 116 /** 117 * parses the EML package using an InputSource 118 */ 119 public void parse(InputSource source) throws Exception { 120 DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); 121 factory.setNamespaceAware(true); 122 DocumentBuilder builder = factory.newDocumentBuilder(); 123 Document doc = builder.parse(source); 124 parseDocument(doc); 125 } 126 127 /** 128 * parses the EML package using an InputStream 129 */ 130 public void parse(InputStream is) throws Exception { 131 DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); 132 factory.setNamespaceAware(true); 133 DocumentBuilder builder = factory.newDocumentBuilder(); 134 Document doc = builder.parse(is); 135 parseDocument(doc); 136 } 137 138 /* 139 * parses the EML document. Now except dataTable, spatialRaster and 140 * spatialVector entities are added. 141 */ 142 private void parseDocument(Document doc) throws Exception { 143 NodeList entities; 144 NodeList spatialRasterEntities; 145 NodeList spatialVectorEntities; 146 NodeList otherEntities; 147 NodeList viewEntities; 148 Element root = doc.getDocumentElement(); 149 nameSpace = root.getNamespaceURI(); 150 //System.out.println("name space is ==== in the document "+nameSpace); 151 CachedXPathAPI xpathapi = new CachedXPathAPI(); 152 try { 153 // now dataTable, spatialRaster and spatialVector are handled 154 entities = xpathapi.selectNodeList(doc, TABLEENTITY); 155 spatialRasterEntities = xpathapi.selectNodeList(doc, 156 SPATIALRASTERENTITY); 157 spatialVectorEntities = xpathapi.selectNodeList(doc, 158 SPATIALVECTORENTITY); 159 otherEntities = xpathapi.selectNodeList(doc, OTHERENTITY); 160 viewEntities = xpathapi.selectNodeList(doc, VIEWENTITY); 161 162 } catch (Exception e) { 163 throw new Exception( 164 "Error extracting entities from eml2.0.0 package."); 165 } 166 167 try { 168 log.debug("Processing entities"); 169 processEntities(xpathapi, entities, TABLEENTITY); 170 // TODO: current we still treat them as TableEntity java object, 171 // in future we need add new SpatialRasterEntity and SpatialVector 172 // object for them 173 processEntities(xpathapi, spatialRasterEntities, 174 SPATIALRASTERENTITY); 175 processEntities(xpathapi, spatialVectorEntities, 176 SPATIALVECTORENTITY); 177 processEntities(xpathapi, otherEntities, OTHERENTITY); 178 processEntities(xpathapi, viewEntities, VIEWENTITY); 179 log.debug("Done processing entities"); 180 } catch (Exception e) { 181 e.printStackTrace(); 182 throw new Exception("Error processing entities: " + e.getMessage()); 183 } 184 } 185 186 /** 187 * returns a hashtable of entity names hashed to the entity description 188 * metadata that goes with each entity. 189 */ 190 public Hashtable<String, Entity> getEntityHash() { 191 return entityHash; 192 } 193 194 195 /** 196 * Get a collection of entities. 197 * @return the collection of entities. 198 */ 199 public List<Entity> getEntities() { 200 return entityList; 201 } 202 203 /** 204 * Get the name space of the document 205 * @return the name space. If no name space, null will be returned. 206 */ 207 public String getNameSpace() { 208 return nameSpace; 209 } 210 211 /** 212 * returns the number of records in this dataItem 213 * 214 * @param entityId 215 * the id of the entity object to get the record count for 216 */ 217 public int getRecordCount(String entityId) { 218 return ((Entity) entityHash.get(entityId)).getNumRecords(); 219 } 220 221 /** 222 * returns the total number of entities in the data item collection that was 223 * passed to this class when the object was created. 224 */ 225 public int getEntityCount() { 226 return numEntities; 227 } 228 229 /** 230 * returns the number of attributes in the given entity 231 * 232 * @param entityId 233 * the id of the entity object that you want the attribute count 234 * for 235 */ 236 public int getAttributeCount(String entityId) { 237 Attribute[] attArray = ((Entity) entityHash.get(entityId)) 238 .getAttributes(); 239 return attArray.length; 240 } 241 242 /** 243 * if the entity has missing value declaretion 244 * 245 * */ 246 public boolean hasMissingValue() { 247 return hasMissingValue; 248 } 249 250 /** 251 * Method to get the boolean hasImageEntity. If the eml document has 252 * SpatialRaster or SpatialVector entity, this variable should be true; 253 * 254 * @return boolean 255 */ 256 /* 257 * public boolean getHasImageEntity() { return this.hasImageEntity; 258 * 259 * } 260 */ 261 262 /* 263 * Porcess the attribute list element 264 */ 265 private void processAttributeList(CachedXPathAPI xpathapi, 266 NodeList attList, Entity entObj) throws Exception { 267 AttributeList attributeList = new AttributeList(); 268 Node attListNode = attList.item(0); 269 // get attributeList element's attribute - id 270 NamedNodeMap idAttribute = attListNode.getAttributes(); 271 String idString = null; 272 if (idAttribute != null) { 273 Node id = idAttribute.getNamedItem("id"); 274 if (id != null) { 275 idString = id.getNodeValue(); 276 if (isDebugging) { 277 log.debug("The id value for the attributelist is " 278 + idString); 279 } 280 } 281 } 282 NodeList attNodeList = xpathapi 283 .selectNodeList(attListNode, "attribute"); 284 NodeList referenceNodeList = xpathapi.selectNodeList(attListNode, 285 "references"); 286 if (attNodeList != null && attNodeList.getLength() > 0) { 287 288 processAttributes(xpathapi, attNodeList, attributeList); 289 if (idString != null) { 290 attributeListHash.put(idString, attributeList); 291 292 } 293 } else if (referenceNodeList != null 294 && referenceNodeList.getLength() > 0) { 295 // get the references id 296 Node referenceNode = referenceNodeList.item(0); 297 if (isDebugging) { 298 log.debug("The reference node's name is " 299 + referenceNode.getNodeName()); 300 } 301 String referenceId = referenceNode.getFirstChild().getNodeValue(); 302 if (isDebugging) { 303 log.debug("the reference id is " + referenceId); 304 } 305 attributeList = attributeListHash.get(referenceId); 306 } else { 307 log 308 .debug("The children name of attribute list couldn't be understood"); 309 throw new Exception(" couldn't be a child of attributeList"); 310 } 311 312 if (!entityObject.isSimpleDelimited()) { 313 int length = attributeList.getAttributes().size(); 314 if (length != complexFormatsNumber 315 || (length == complexFormatsNumber && complexFormatsNumber == 0)) { 316 throw new Exception("Complex format elements should have" 317 + " some number as attribute number"); 318 } else { 319 // entityObject.setDataFormatArray(formatArray); 320 } 321 } 322 323 entityObject.setAttributeList(attributeList); 324 325 } 326 327 /** 328 * process the attributes 329 */ 330 private void processAttributes(CachedXPathAPI xpathapi, NodeList atts, 331 AttributeList attributeListObj) throws Exception { 332 333 for (int i = 0; i < atts.getLength(); i++) { // go through each 334 // attribute 335 Node att = atts.item(i); 336 NodeList attChildren = att.getChildNodes(); 337 NamedNodeMap attAttributes = att.getAttributes(); 338 339 String attName = ""; 340 String attLabel = ""; 341 String attDefinition = ""; 342 String attUnit = ""; 343 String attUnitType = ""; 344 String attStorageType = ""; 345 String attMeasurementScale = ""; 346 String attPrecision = ""; 347 Domain domain = null; 348 Vector missingCodeVector = new Vector(); 349 350 elementId++; 351 352 for (int j = 0; j < attChildren.getLength(); j++) { 353 Node child = attChildren.item(j); 354 String childName = child.getNodeName(); 355 if (childName.equals("attributeName")) { 356 attName = child.getFirstChild().getNodeValue().trim() 357 .replace('.', '_'); 358 } else if (childName.equals("attributeLabel")) { 359 attLabel = child.getFirstChild().getNodeValue().trim(); 360 } else if (childName.equals("attributeDefinition")) { 361 attDefinition = child.getFirstChild().getNodeValue().trim(); 362 } else if (childName.equals("measurementScale")) { 363 // unit is tricky because it can be custom or standard 364 // Vector info = new Vector(); 365 // int domainType = Domain.DOM_NONE; 366 NodeList msNodeList = child.getChildNodes(); 367 for (int k = 0; k < msNodeList.getLength(); k++) { 368 Node n = msNodeList.item(k); 369 String name = n.getNodeName(); 370 if (name.equals("interval") || name.equals("ratio")) { 371 String numberType = null; 372 String min = "", max = ""; 373 Node sUnit = xpathapi.selectSingleNode(n, 374 "unit/standardUnit"); 375 Node cUnit = xpathapi.selectSingleNode(n, 376 "unit/customUnit"); 377 if (sUnit != null) { 378 attUnit = sUnit.getFirstChild().getNodeValue(); 379 attUnitType = Attribute.STANDARDUNIT; 380 } else if (cUnit != null) { 381 attUnit = cUnit.getFirstChild().getNodeValue(); 382 attUnitType = Attribute.CUSTOMUNIT; 383 } else { 384 System.err.println("xpath didn't work"); 385 } 386 Node precision = xpathapi.selectSingleNode(n, 387 "precision"); 388 if (precision != null) { 389 // precision is optional in EML201 so if it is 390 // not provided, the attPrecision will be the 391 // empty string 392 attPrecision = precision.getFirstChild() 393 .getNodeValue(); 394 } 395 Node dNode = xpathapi.selectSingleNode(n, 396 "numericDomain"); 397 NodeList numberKids = dNode.getChildNodes(); 398 for (int index = 0; index < numberKids.getLength(); index++) { 399 400 String dName = numberKids.item(index) 401 .getNodeName(); 402 if (dName.equals("numberType")) // got number 403 // type 404 { 405 numberType = numberKids.item(index) 406 .getFirstChild().getNodeValue(); 407 if (isDebugging) { 408 log.debug("The number type is " 409 + numberType); 410 } 411 } else if (dName.equals("boundsGroup")) 412 // got bounds group 413 { 414 NodeList boundsList = xpathapi 415 .selectNodeList(dNode, "./bounds"); 416 for (i = 0; i < boundsList.getLength(); i++) { 417 NodeList nl; 418 Node bound; 419 420 String exclMin = null, exclMax = null; 421 try { 422 nl = xpathapi.selectNodeList( 423 boundsList.item(i), 424 "./minimum"); 425 bound = nl.item(0); 426 min = bound.getFirstChild() 427 .getNodeValue(); 428 exclMin = bound.getAttributes() 429 .getNamedItem("exclusive") 430 .getNodeValue(); 431 nl = xpathapi.selectNodeList( 432 boundsList.item(0), 433 "./maximum"); 434 bound = nl.item(0); 435 max = bound.getFirstChild() 436 .getNodeValue(); 437 exclMax = bound.getAttributes() 438 .getNamedItem("exclusive") 439 .getNodeValue(); 440 } catch (Exception e) { 441 log.debug("Error in handle bound ", 442 e); 443 } 444 } 445 446 } 447 448 } 449 Double minNum = null; 450 Double maxNum = null; 451 if (!min.trim().equals("") 452 && !max.trim().equals("")) { 453 minNum = new Double(min); 454 maxNum = new Double(max); 455 } 456 domain = new NumericDomain(numberType, minNum, 457 maxNum); 458 459 } else if (name.equals("nominal") 460 || name.equals("ordinal")) { 461 NodeList list = xpathapi.selectSingleNode(n, 462 "nonNumericDomain").getChildNodes(); 463 for (int m = 0; m < list.getLength(); m++) { 464 Node dNode = list.item(m); 465 String dName = dNode.getNodeName(); 466 if (dName.equals("textDomain")) { 467 TextDomain textDomain = new TextDomain(); 468 NodeList definitionL = xpathapi 469 .selectNodeList(dNode, 470 "./definition"); 471 Node defintionNode = definitionL.item(0); 472 String definition = defintionNode 473 .getFirstChild().getNodeValue(); 474 if (isDebugging) { 475 log.debug("The definition value is " 476 + definition); 477 } 478 textDomain.setDefinition(definition); 479 NodeList nl = xpathapi.selectNodeList( 480 dNode, "./pattern"); 481 String[] patternList = new String[nl 482 .getLength()]; 483 for (int l = 0; l < nl.getLength(); l++) { 484 patternList[l] = nl.item(l) 485 .getFirstChild().getNodeValue(); 486 } 487 if (patternList.length > 0) { 488 textDomain.setPattern(patternList); 489 } 490 domain = textDomain; 491 492 } else if (dName.equals("enumeratedDomain")) { 493 EnumeratedDomain enumerDomain = new EnumeratedDomain(); 494 Vector info = new Vector(); 495 NodeList nl = xpathapi.selectNodeList( 496 dNode, "./codeDefinition"); 497 for (int l = 0; l < nl.getLength(); l++) { 498 info.add(nl.item(l).getFirstChild() 499 .getNodeValue()); 500 } 501 enumerDomain.setInfo(info); 502 domain = enumerDomain; 503 504 } 505 } 506 507 } else if (name.equalsIgnoreCase("datetime")) { 508 DateTimeDomain date = new DateTimeDomain(); 509 String formatString = (xpathapi.selectSingleNode(n, 510 "./formatString")).getFirstChild() 511 .getNodeValue(); 512 if (isDebugging) { 513 log.debug("The format string in date time is " 514 + formatString); 515 } 516 date.setFormatString(formatString); 517 domain = date; 518 519 } 520 } 521 522 } else if (childName.equals("missingValueCode")) { 523 log.debug("in missilng valueCode"); 524 NodeList missingNodeList = child.getChildNodes(); 525 for (int k = 0; k < missingNodeList.getLength(); k++) { 526 Node n = missingNodeList.item(k); 527 String name = n.getNodeName(); 528 if (name.equals("code")) { 529 530 Node missingCodeTextNode = n.getFirstChild(); 531 if (missingCodeTextNode != null) { 532 String missingCode = missingCodeTextNode 533 .getNodeValue(); 534 if (isDebugging) { 535 log.debug("the missing code is " 536 + missingCode); 537 } 538 missingCodeVector.add(missingCode); 539 hasMissingValue = true; 540 } 541 } 542 } 543 544 } 545 } 546 547 String resolvedType; 548 DataType dataType = domain.getDataType(); 549 resolvedType = dataType.getName(); 550 if (isDebugging) { 551 log.debug("The final type is " + resolvedType); 552 } 553 554 //check for duplicates of this name 555 if (attributeListObj.containsNamedAttribute(attName)) { 556 int duplicateNameCounter = 1; 557 while (attributeListObj.containsNamedAttribute(attName)) { 558 attName += "_" + duplicateNameCounter; 559 duplicateNameCounter++; 560 } 561 } 562 563 Attribute attObj = new Attribute(Integer.toString(elementId), 564 attName, attLabel, attDefinition, attUnit, attUnitType, 565 resolvedType, attMeasurementScale, domain); 566 567 // add missing code into attribute 568 for (int k = 0; k < missingCodeVector.size(); k++) { 569 570 String missingCodeValue = (String) missingCodeVector 571 .elementAt(k); 572 if (isDebugging) { 573 log.debug("the mssing value code " + missingCodeValue 574 + " was added to attribute"); 575 } 576 attObj.addMissingValueCode(missingCodeValue); 577 } 578 579 attributeListObj.add(attObj); 580 581 } 582 } 583 584 /** 585 * pulls the entity information out of the xml and stores it in a hashtable 586 */ 587 private void processEntities(CachedXPathAPI xpathapi, NodeList entities, 588 String xpath) throws SAXException, 589 javax.xml.transform.TransformerException, Exception { 590 // make sure that entities is not null 591 if (entities == null) { 592 return; 593 } 594 int entityNodeListLength = entities.getLength(); 595 596 String entityName = ""; 597 String entityDescription = ""; 598 String entityOrientation = ""; 599 String entityCaseSensitive = ""; 600 String entityNumberOfRecords = "-1"; 601 String physicalFile = ""; 602 String numHeaderLines = "0"; 603 int numFooterLines = 0; 604 String fieldDelimiter = null; 605 String recordDelimiter = ""; 606 String compressionMethod = ""; 607 String encodingMethod = ""; 608 boolean isImageEntity = false; 609 boolean isGZipDataFile = false; 610 boolean isZipDataFile = false; 611 boolean isTarDataFile = false; 612 boolean isSimpleDelimited = true; 613 boolean isCollapseDelimiter = false; 614 TextComplexDataFormat[] formatArray = null; 615 int entityCounter = 0; 616 617 for (int i = 0; i < entityNodeListLength; i++) { 618 619 if (xpath != null 620 && (xpath.equals(SPATIALRASTERENTITY) || xpath 621 .equals(SPATIALVECTORENTITY))) { 622 isImageEntity = true; 623 } 624 // go through the entities and put the information into the hash. 625 elementId++; 626 Node entity = entities.item(i); 627 NodeList entityChildren = entity.getChildNodes(); 628 for (int j = 0; j < entityChildren.getLength(); j++) { 629 Node child = entityChildren.item(j); 630 String childName = child.getNodeName(); 631 632 if (childName.equals("entityName")) { 633 entityName = child.getFirstChild().getNodeValue(); 634 } else if (childName.equals("entityDescription")) { 635 entityDescription = child.getFirstChild().getNodeValue(); 636 } else if (childName.equals("caseSensitive")) { 637 entityCaseSensitive = child.getFirstChild().getNodeValue(); 638 } else if (childName.equals("numberOfRecords")) { 639 entityNumberOfRecords = child.getFirstChild() 640 .getNodeValue(); 641 if (entityNumberOfRecords != null) { 642 entityNumberOfRecords = entityNumberOfRecords.trim(); 643 numRecords = (new Integer(entityNumberOfRecords)) 644 .intValue(); 645 } 646 } 647 648 } 649 650 NodeList orientationNodeList = xpathapi.selectNodeList(entity, 651 "physical/dataFormat/textFormat/attributeOrientation"); 652 if (orientationNodeList != null 653 && orientationNodeList.getLength() > 0) { 654 entityOrientation = orientationNodeList.item(0).getFirstChild() 655 .getNodeValue(); 656 657 } 658 659 NodeList headerLinesNL = xpathapi.selectNodeList(entity, 660 "physical/dataFormat/textFormat/numHeaderLines"); 661 if ((headerLinesNL != null) && (headerLinesNL.getLength() > 0)) { 662 Node headerLinesNode = headerLinesNL.item(0); 663 if (headerLinesNode != null) { 664 numHeaderLines = headerLinesNode.getFirstChild() 665 .getNodeValue(); 666 } 667 } 668 669 NodeList footerLinesNL = xpathapi.selectNodeList(entity, 670 "physical/dataFormat/textFormat/numFooterLines"); 671 if ((footerLinesNL != null) && (footerLinesNL.getLength() > 0)) { 672 Node footerLinesNode = footerLinesNL.item(0); 673 if (footerLinesNode != null) { 674 String footerLineStr = footerLinesNode.getFirstChild() 675 .getNodeValue(); 676 numFooterLines = (new Integer(footerLineStr.trim())) 677 .intValue(); 678 } 679 } 680 681 // Here is the simple delimited data file 682 NodeList delimiterNL = xpathapi 683 .selectNodeList(entity, 684 "physical/dataFormat/textFormat/simpleDelimited/fieldDelimiter"); 685 if (delimiterNL != null && delimiterNL.getLength() > 0) { 686 fieldDelimiter = delimiterNL.item(0).getFirstChild() 687 .getNodeValue(); 688 } 689 690 // Here is the simple delimited data file 691 NodeList collapseDelimiterNL = xpathapi 692 .selectNodeList(entity, 693 "physical/dataFormat/textFormat/simpleDelimited/collapseDelimiters"); 694 if (collapseDelimiterNL != null 695 && collapseDelimiterNL.getLength() > 0) { 696 String collapseDelimiter = collapseDelimiterNL.item(0) 697 .getFirstChild().getNodeValue(); 698 if (collapseDelimiter.equalsIgnoreCase("yes")) { 699 isCollapseDelimiter = true; 700 } 701 702 } 703 704 // for complex format data file 705 NodeList complexFormatNL = xpathapi.selectNodeList(entity, 706 "physical/dataFormat/textFormat/complex"); 707 if (complexFormatNL != null && complexFormatNL.getLength() > 0) { 708 log.debug("in handle complex text data format"); 709 isSimpleDelimited = false; 710 Node complexFormatNode = complexFormatNL.item(0); 711 NodeList complexFormatChildren = complexFormatNode 712 .getChildNodes(); 713 int childrenLength = complexFormatChildren.getLength(); 714 Vector formatVector = new Vector(); 715 for (int k = 0; k < childrenLength; k++) { 716 Node node = complexFormatChildren.item(k); 717 if (node != null && node.getNodeName().equals("textFixed")) { 718 TextWidthFixedDataFormat textFixedFormat = handleTextFixedDataFormatNode(node); 719 if (textFixedFormat != null) { 720 formatVector.add(textFixedFormat); 721 // complexFormatsNumber++; 722 } 723 724 } else if (node != null 725 && node.getNodeName().equals("textDelimited")) { 726 TextDelimitedDataFormat delimitedFormat = handleComplexDelimitedDataFormatNode(node); 727 if (delimitedFormat != null) { 728 formatVector.add(delimitedFormat); 729 // complexFormatsNumber++; 730 } 731 } 732 } 733 // transfer vector to array 734 complexFormatsNumber = formatVector.size(); 735 formatArray = new TextComplexDataFormat[complexFormatsNumber]; 736 for (int j = 0; j < complexFormatsNumber; j++) { 737 formatArray[j] = (TextComplexDataFormat) formatVector 738 .elementAt(j); 739 } 740 741 } 742 743 NodeList recDelimiterNL = xpathapi.selectNodeList(entity, 744 "physical/dataFormat/textFormat/recordDelimiter"); 745 if ((recDelimiterNL != null) && (recDelimiterNL.getLength() > 0)) { 746 recordDelimiter = recDelimiterNL.item(0).getFirstChild() 747 .getNodeValue(); 748 } else { 749 recordDelimiter = "\r\n"; 750 } 751 // get the distribution information 752 NodeList distributionNL = xpathapi.selectNodeList(entity, 753 "physical/distribution/online/url"); 754 if (distributionNL != null && distributionNL.getLength() > 0) { 755 physicalFile = distributionNL.item(0).getFirstChild() 756 .getNodeValue(); 757 if (isDebugging) { 758 log.debug("The url is " + physicalFile); 759 } 760 } 761 // if this url is donwloadable, if the value is "information", it is 762 // not downloadable 763 // otherwise, it is downloadable 764 Boolean isDownloadable = true; 765 NodeList distributionURLNL = xpathapi.selectNodeList(entity, 766 "physical/distribution/online/url/@function"); 767 if (distributionURLNL != null && distributionURLNL.getLength() > 0) { 768 String function = distributionURLNL.item(0).getNodeValue(); 769 log.debug("The function value is ============ " + function); 770 if (function != null && function.equals(INFORMATION)) { 771 isDownloadable = false; 772 } 773 } 774 775 // get the compressionMethod information 776 NodeList compressionNL = xpathapi.selectNodeList(entity, 777 "physical/compressionMethod"); 778 if (compressionNL != null && compressionNL.getLength() > 0) { 779 compressionMethod = compressionNL.item(0).getFirstChild() 780 .getNodeValue(); 781 if (isDebugging) { 782 log.debug("Compression method is " + compressionMethod); 783 } 784 if (compressionMethod != null 785 && compressionMethod.equals(Entity.GZIP)) { 786 isGZipDataFile = true; 787 } else if (compressionMethod != null 788 && compressionMethod.equals(Entity.ZIP)) { 789 isZipDataFile = true; 790 } 791 } 792 793 // get encoding method info (mainly for tar file) 794 NodeList encodingNL = xpathapi.selectNodeList(entity, 795 "physical/encodingMethod"); 796 if (encodingNL != null && encodingNL.getLength() > 0) { 797 encodingMethod = encodingNL.item(0).getFirstChild() 798 .getNodeValue(); 799 if (isDebugging) { 800 log.debug("encoding method is " + encodingMethod); 801 } 802 if (encodingMethod != null && encodingMethod.equals(Entity.TAR)) { 803 isTarDataFile = true; 804 } 805 } 806 807 if (entityOrientation.trim().equals("column")) { 808 entityOrientation = Entity.COLUMNMAJOR; 809 } else { 810 entityOrientation = Entity.ROWMAJOR; 811 } 812 813 if (entityCaseSensitive.equals("yes")) { 814 entityCaseSensitive = "true"; 815 } else { 816 entityCaseSensitive = "false"; 817 } 818 819 entityObject = new Entity(Integer.toString(elementId), entityName 820 .trim(), entityDescription.trim(), new Boolean( 821 entityCaseSensitive), entityOrientation, new Integer( 822 entityNumberOfRecords).intValue()); 823 entityObject.setNumHeaderLines((new Integer(numHeaderLines)) 824 .intValue()); 825 entityObject.setNumFooterLines(numFooterLines); 826 entityObject.setSimpleDelimited(isSimpleDelimited); 827 // for simple dimited data file 828 if (fieldDelimiter != null) { 829 entityObject.setDelimiter(fieldDelimiter); 830 } 831 entityObject.setCollaplseDelimiter(isCollapseDelimiter); 832 833 entityObject.setRecordDelimiter(recordDelimiter); 834 entityObject.setURL(physicalFile); 835 entityObject.setCompressionMethod(compressionMethod); 836 entityObject.setIsImageEntity(isImageEntity); 837 entityObject.setHasGZipDataFile(isGZipDataFile); 838 entityObject.setHasZipDataFile(isZipDataFile); 839 entityObject.setHasTarDataFile(isTarDataFile); 840 entityObject.setDownloadable(isDownloadable); 841 842 try { 843 NodeList attNL = xpathapi.selectNodeList(entity, 844 "attributeList"); 845 processAttributeList(xpathapi, attNL, entityObject); 846 entityObject.setDataFormatArray(formatArray); 847 848 } catch (Exception e) { 849 log.warn("Error parsing attributes: " 850 + e.getMessage()+ " So this entity "+entityObject.getName() +" may not have attribute list"); 851 } 852 if (entityObject.isDownloadable()) { 853 entityHash.put(Integer.toString(elementId), entityObject); 854 entityList.add(entityObject); 855 entityCounter++; 856 } 857 858 // fileHash.put(elementId, physicalFile); 859 860 } 861 numEntities = numEntities + entityCounter; 862 863 } 864 865 /* 866 * This method will digest a text fixed data format node and return a 867 * TextFixedDataFormat object. 868 */ 869 private TextWidthFixedDataFormat handleTextFixedDataFormatNode(Node node) 870 throws Exception { 871 TextWidthFixedDataFormat format = null; 872 if (node == null) { 873 return format; 874 } 875 NodeList children = node.getChildNodes(); 876 int length = children.getLength(); 877 for (int i = 0; i < length; i++) { 878 Node kid = children.item(i); 879 String elementName = kid.getNodeName(); 880 if (elementName != null && elementName.equals("fieldWidth")) { 881 String fieldWidthStr = kid.getFirstChild().getNodeValue(); 882 883 int fieldWidth = (new Integer(fieldWidthStr)).intValue(); 884 if (isDebugging) { 885 log.debug("The filed width for fix width in eml is " 886 + fieldWidth); 887 } 888 format = new TextWidthFixedDataFormat(fieldWidth); 889 } else if (elementName != null 890 && elementName.equals("fieldStartColumn") && format != null) { 891 String startColumnStr = kid.getFirstChild().getNodeValue(); 892 int startColumn = (new Integer(startColumnStr)).intValue(); 893 if (isDebugging) { 894 log.debug("The start column is " + startColumn); 895 } 896 format.setFieldStartColumn(startColumn); 897 } else if (elementName != null && elementName.equals("lineNumber") 898 && format != null) { 899 String lineNumberStr = kid.getFirstChild().getNodeValue(); 900 int lineNumber = (new Integer(lineNumberStr)).intValue(); 901 if (isDebugging) { 902 log.debug("The start column is " + lineNumber); 903 } 904 format.setLineNumber(lineNumber); 905 } 906 } 907 return format; 908 } 909 910 /* 911 * This method will digest a delimited data format node and return a 912 * DelimitedFixedFormat object. 913 */ 914 private TextDelimitedDataFormat handleComplexDelimitedDataFormatNode( 915 Node node) throws Exception { 916 TextDelimitedDataFormat format = null; 917 if (node == null) { 918 return format; 919 } 920 NodeList children = node.getChildNodes(); 921 int length = children.getLength(); 922 Vector quoteList = new Vector(); 923 for (int i = 0; i < length; i++) { 924 Node kid = children.item(i); 925 String elementName = kid.getNodeName(); 926 if (elementName != null && elementName.equals("fieldDelimiter")) { 927 String fieldDelimiter = kid.getFirstChild().getNodeValue(); 928 if (isDebugging) { 929 log 930 .debug("The filed delimiter for complex format in eml is " 931 + fieldDelimiter); 932 } 933 format = new TextDelimitedDataFormat(fieldDelimiter); 934 } else if (elementName != null && elementName.equals("lineNumber") 935 && format != null) { 936 String lineNumberStr = kid.getFirstChild().getNodeValue(); 937 int lineNumber = (new Integer(lineNumberStr)).intValue(); 938 if (isDebugging) { 939 log.debug("The line number is " + lineNumber); 940 } 941 format.setLineNumber(lineNumber); 942 } else if (elementName != null 943 && elementName.equals("collapseDelimiter") 944 && format != null) { 945 String collapse = kid.getFirstChild().getNodeValue(); 946 if (isDebugging) { 947 log.debug("The collapse delimiter " + collapse); 948 } 949 format.setCollapseDelimiter(collapse); 950 } else if (elementName != null 951 && elementName.equals("quoteCharacter") && format != null) { 952 String quote = kid.getFirstChild().getNodeValue(); 953 quoteList.add(quote); 954 955 } 956 957 } 958 // set up quoteList 959 if (format != null) { 960 int size = quoteList.size(); 961 String[] quoteArray = new String[size]; 962 for (int i = 0; i < size; i++) { 963 quoteArray[i] = (String) quoteList.elementAt(i); 964 } 965 format.setQuoteCharater(quoteArray); 966 } 967 return format; 968 } 969}