001/*
002 * Copyright (c) 2003-2010 The Regents of the University of California.
003 * All rights reserved.
004 *
005 * '$Author: crawl $'
006 * '$Date: 2012-11-26 22:19:36 +0000 (Mon, 26 Nov 2012) $' 
007 * '$Revision: 31113 $'
008 * 
009 * Permission is hereby granted, without written agreement and without
010 * license or royalty fees, to use, copy, modify, and distribute this
011 * software and its documentation for any purpose, provided that the above
012 * copyright notice and the following two paragraphs appear in all copies
013 * of this software.
014 *
015 * IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
016 * FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
017 * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF
018 * THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF
019 * SUCH DAMAGE.
020 *
021 * THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES,
022 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
023 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE
024 * PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE UNIVERSITY OF
025 * CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
026 * ENHANCEMENTS, OR MODIFICATIONS.
027 *
028 */
029
030package org.ecoinformatics.seek.datasource.eml.eml2;
031
032import java.io.InputStream;
033import java.util.Hashtable;
034import java.util.List;
035import java.util.Vector;
036
037import javax.xml.parsers.DocumentBuilder;
038import javax.xml.parsers.DocumentBuilderFactory;
039
040import org.apache.commons.logging.Log;
041import org.apache.commons.logging.LogFactory;
042import org.apache.xpath.CachedXPathAPI;
043import org.kepler.metadata.ParserInterface;
044import org.kepler.objectmanager.data.DataType;
045import org.kepler.objectmanager.data.DataTypeResolver;
046import org.kepler.objectmanager.data.DateTimeDomain;
047import org.kepler.objectmanager.data.Domain;
048import org.kepler.objectmanager.data.EnumeratedDomain;
049import org.kepler.objectmanager.data.NumericDomain;
050import org.kepler.objectmanager.data.db.Attribute;
051import org.kepler.objectmanager.data.db.AttributeList;
052import org.kepler.objectmanager.data.db.Entity;
053import org.kepler.objectmanager.data.text.TextComplexDataFormat;
054import org.kepler.objectmanager.data.text.TextDelimitedDataFormat;
055import org.kepler.objectmanager.data.text.TextDomain;
056import org.kepler.objectmanager.data.text.TextWidthFixedDataFormat;
057import org.w3c.dom.Document;
058import org.w3c.dom.Element;
059import org.w3c.dom.NamedNodeMap;
060import org.w3c.dom.Node;
061import org.w3c.dom.NodeList;
062import org.xml.sax.InputSource;
063import org.xml.sax.SAXException;
064
065/**
066 * This plugin parses EML 2.0.0 metadata files
067 */
068public class Eml200Parser implements ParserInterface {
069
070        //private static String NAMESPACE = "eml://ecoinformatics.org/eml-2.0.0";
071  private String nameSpace = null;
072  private Hashtable<String, Entity> entityHash = new Hashtable<String, Entity>();
073  private Vector<Entity> entityList = new Vector<Entity>();//this one will preserve the order of the entity.
074        // private Hashtable fileHash = new Hashtable();
075        private int numEntities = 0;
076        private int numRecords = -1;
077        private Entity entityObject = null;
078        private DataTypeResolver dtr = DataTypeResolver.instanceOf();
079        private int elementId = 0;
080        // private boolean hasImageEntity = false;
081        private int complexFormatsNumber = 0;
082        private Hashtable<String, AttributeList> attributeListHash = new Hashtable<String, AttributeList>();
083        private boolean hasMissingValue = false;
084
085        private static Log log;
086        private static boolean isDebugging;
087
088        static {
089                log = LogFactory
090                                .getLog("org.ecoinformatics.seek.datasource.eml.eml2.Eml200Parser");
091                isDebugging = log.isDebugEnabled();
092        }
093
094        // constants
095        public static final String TABLEENTITY = "//dataset/dataTable";
096        public static final String SPATIALRASTERENTITY = "//dataset/spatialRaster";
097        public static final String SPATIALVECTORENTITY = "//dataset/spatialVector";
098        public static final String STOREDPROCEDUREENTITY = "//dataset/storedProcedure";
099        public static final String VIEWENTITY = "//dataset/view";
100        public static final String OTHERENTITY = "//dataset/otherEntity";
101        public static final String EML = "eml";
102  public static final String PACKAGEID="packageId";
103        private static final String INFORMATION = "information";
104
105        /**
106         * returns a hashtable of with the id of the entity as the key and the data
107         * file id to which the entity refers as the value. This way, if you want to
108         * know what data file goes with an entity, you can do a get on this hash
109         * for the id of the entity. note that the entity id is the xml entity id
110         * from the generated input step, not the id of the entity file itself.
111         */
112        /*
113         * public Hashtable getDataFilesHash() { return fileHash; }
114         */
115
116        /**
117         * parses the EML package using an InputSource
118         */
119        public void parse(InputSource source) throws Exception {
120          DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
121          factory.setNamespaceAware(true);
122                DocumentBuilder builder = factory.newDocumentBuilder();
123                Document doc = builder.parse(source);
124                parseDocument(doc);
125        }
126
127        /**
128         * parses the EML package using an InputStream
129         */
130        public void parse(InputStream is) throws Exception {
131          DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
132    factory.setNamespaceAware(true);
133    DocumentBuilder builder = factory.newDocumentBuilder();
134                Document doc = builder.parse(is);
135                parseDocument(doc);
136        }
137
138        /*
139         * parses the EML document. Now except dataTable, spatialRaster and
140         * spatialVector entities are added.
141         */
142        private void parseDocument(Document doc) throws Exception {
143                NodeList entities;
144                NodeList spatialRasterEntities;
145                NodeList spatialVectorEntities;
146                NodeList otherEntities;
147                NodeList viewEntities;
148                Element root = doc.getDocumentElement();
149                nameSpace = root.getNamespaceURI();
150                //System.out.println("name space is ==== in the document "+nameSpace);
151                CachedXPathAPI xpathapi = new CachedXPathAPI();
152                try {
153                        // now dataTable, spatialRaster and spatialVector are handled
154                        entities = xpathapi.selectNodeList(doc, TABLEENTITY);
155                        spatialRasterEntities = xpathapi.selectNodeList(doc,
156                                        SPATIALRASTERENTITY);
157                        spatialVectorEntities = xpathapi.selectNodeList(doc,
158                                        SPATIALVECTORENTITY);
159                        otherEntities = xpathapi.selectNodeList(doc, OTHERENTITY);
160                        viewEntities = xpathapi.selectNodeList(doc, VIEWENTITY);
161
162                } catch (Exception e) {
163                        throw new Exception(
164                                        "Error extracting entities from eml2.0.0 package.");
165                }
166
167                try {
168                        log.debug("Processing entities");
169                        processEntities(xpathapi, entities, TABLEENTITY);
170                        // TODO: current we still treat them as TableEntity java object,
171                        // in future we need add new SpatialRasterEntity and SpatialVector
172                        // object for them
173                        processEntities(xpathapi, spatialRasterEntities,
174                                        SPATIALRASTERENTITY);
175                        processEntities(xpathapi, spatialVectorEntities,
176                                        SPATIALVECTORENTITY);
177                        processEntities(xpathapi, otherEntities, OTHERENTITY);
178                        processEntities(xpathapi, viewEntities, VIEWENTITY);
179                        log.debug("Done processing entities");
180                } catch (Exception e) {
181      e.printStackTrace();
182                        throw new Exception("Error processing entities: " + e.getMessage());
183                }
184        }
185
186        /**
187         * returns a hashtable of entity names hashed to the entity description
188         * metadata that goes with each entity.
189         */
190        public Hashtable<String, Entity> getEntityHash() {
191                return entityHash;
192        }
193        
194        
195        /**
196         * Get a collection of entities.
197         * @return the collection of entities.
198         */
199        public List<Entity> getEntities() {
200          return entityList;
201        }
202        
203        /**
204         * Get the name space of the document
205         * @return the name space. If no name space, null will be returned.
206         */
207        public String getNameSpace() {
208          return nameSpace;
209        }
210
211        /**
212         * returns the number of records in this dataItem
213         * 
214         * @param entityId
215         *            the id of the entity object to get the record count for
216         */
217        public int getRecordCount(String entityId) {
218                return ((Entity) entityHash.get(entityId)).getNumRecords();
219        }
220
221        /**
222         * returns the total number of entities in the data item collection that was
223         * passed to this class when the object was created.
224         */
225        public int getEntityCount() {
226                return numEntities;
227        }
228
229        /**
230         * returns the number of attributes in the given entity
231         * 
232         * @param entityId
233         *            the id of the entity object that you want the attribute count
234         *            for
235         */
236        public int getAttributeCount(String entityId) {
237                Attribute[] attArray = ((Entity) entityHash.get(entityId))
238                                .getAttributes();
239                return attArray.length;
240        }
241
242        /**
243         * if the entity has missing value declaretion
244         * 
245         *       */
246        public boolean hasMissingValue() {
247                return hasMissingValue;
248        }
249
250        /**
251         * Method to get the boolean hasImageEntity. If the eml document has
252         * SpatialRaster or SpatialVector entity, this variable should be true;
253         * 
254         * @return boolean
255         */
256        /*
257         * public boolean getHasImageEntity() { return this.hasImageEntity;
258         * 
259         * }
260         */
261
262        /*
263         * Porcess the attribute list element
264         */
265        private void processAttributeList(CachedXPathAPI xpathapi,
266                        NodeList attList, Entity entObj) throws Exception {
267                AttributeList attributeList = new AttributeList();
268                Node attListNode = attList.item(0);
269                // get attributeList element's attribute - id
270                NamedNodeMap idAttribute = attListNode.getAttributes();
271                String idString = null;
272                if (idAttribute != null) {
273                        Node id = idAttribute.getNamedItem("id");
274                        if (id != null) {
275                                idString = id.getNodeValue();
276                                if (isDebugging) {
277                                        log.debug("The id value for the attributelist is "
278                                                        + idString);
279                                }
280                        }
281                }
282                NodeList attNodeList = xpathapi
283                                .selectNodeList(attListNode, "attribute");
284                NodeList referenceNodeList = xpathapi.selectNodeList(attListNode,
285                                "references");
286                if (attNodeList != null && attNodeList.getLength() > 0) {
287
288                        processAttributes(xpathapi, attNodeList, attributeList);
289                        if (idString != null) {
290                                attributeListHash.put(idString, attributeList);
291
292                        }
293                } else if (referenceNodeList != null
294                                && referenceNodeList.getLength() > 0) {
295                        // get the references id
296                        Node referenceNode = referenceNodeList.item(0);
297                        if (isDebugging) {
298                                log.debug("The reference node's name is "
299                                                + referenceNode.getNodeName());
300                        }
301                        String referenceId = referenceNode.getFirstChild().getNodeValue();
302                        if (isDebugging) {
303                                log.debug("the reference id is " + referenceId);
304                        }
305                        attributeList = attributeListHash.get(referenceId);
306                } else {
307                        log
308                                        .debug("The children name of attribute list couldn't be understood");
309                        throw new Exception(" couldn't be a child of attributeList");
310                }
311
312                if (!entityObject.isSimpleDelimited()) {
313                        int length = attributeList.getAttributes().size();
314                        if (length != complexFormatsNumber
315                                        || (length == complexFormatsNumber && complexFormatsNumber == 0)) {
316                                throw new Exception("Complex format elements should have"
317                                                + " some number as attribute number");
318                        } else {
319                                // entityObject.setDataFormatArray(formatArray);
320                        }
321                }
322
323                entityObject.setAttributeList(attributeList);
324
325        }
326
327        /**
328         * process the attributes
329         */
330        private void processAttributes(CachedXPathAPI xpathapi, NodeList atts,
331                        AttributeList attributeListObj) throws Exception {
332
333                for (int i = 0; i < atts.getLength(); i++) { // go through each
334                        // attribute
335                        Node att = atts.item(i);
336                        NodeList attChildren = att.getChildNodes();
337                        NamedNodeMap attAttributes = att.getAttributes();
338
339                        String attName = "";
340                        String attLabel = "";
341                        String attDefinition = "";
342                        String attUnit = "";
343                        String attUnitType = "";
344                        String attStorageType = "";
345                        String attMeasurementScale = "";
346                        String attPrecision = "";
347                        Domain domain = null;
348                        Vector missingCodeVector = new Vector();
349
350                        elementId++;
351
352                        for (int j = 0; j < attChildren.getLength(); j++) {
353                                Node child = attChildren.item(j);
354                                String childName = child.getNodeName();
355                                if (childName.equals("attributeName")) {
356                                        attName = child.getFirstChild().getNodeValue().trim()
357                                                        .replace('.', '_');
358                                } else if (childName.equals("attributeLabel")) {
359                                        attLabel = child.getFirstChild().getNodeValue().trim();
360                                } else if (childName.equals("attributeDefinition")) {
361                                        attDefinition = child.getFirstChild().getNodeValue().trim();
362                                } else if (childName.equals("measurementScale")) {
363                                        // unit is tricky because it can be custom or standard
364                                        // Vector info = new Vector();
365                                        // int domainType = Domain.DOM_NONE;
366                                        NodeList msNodeList = child.getChildNodes();
367                                        for (int k = 0; k < msNodeList.getLength(); k++) {
368                                                Node n = msNodeList.item(k);
369                                                String name = n.getNodeName();
370                                                if (name.equals("interval") || name.equals("ratio")) {
371                                                        String numberType = null;
372                                                        String min = "", max = "";
373                                                        Node sUnit = xpathapi.selectSingleNode(n,
374                                                                        "unit/standardUnit");
375                                                        Node cUnit = xpathapi.selectSingleNode(n,
376                                                                        "unit/customUnit");
377                                                        if (sUnit != null) {
378                                                                attUnit = sUnit.getFirstChild().getNodeValue();
379                                                                attUnitType = Attribute.STANDARDUNIT;
380                                                        } else if (cUnit != null) {
381                                                                attUnit = cUnit.getFirstChild().getNodeValue();
382                                                                attUnitType = Attribute.CUSTOMUNIT;
383                                                        } else {
384                                                                System.err.println("xpath didn't work");
385                                                        }
386                                                        Node precision = xpathapi.selectSingleNode(n,
387                                                                        "precision");
388                                                        if (precision != null) {
389                                                                // precision is optional in EML201 so if it is
390                                                                // not provided, the attPrecision will be the
391                                                                // empty string
392                                                                attPrecision = precision.getFirstChild()
393                                                                                .getNodeValue();
394                                                        }
395                                                        Node dNode = xpathapi.selectSingleNode(n,
396                                                                        "numericDomain");
397                                                        NodeList numberKids = dNode.getChildNodes();
398                                                        for (int index = 0; index < numberKids.getLength(); index++) {
399
400                                                                String dName = numberKids.item(index)
401                                                                                .getNodeName();
402                                                                if (dName.equals("numberType")) // got number
403                                                                // type
404                                                                {
405                                                                        numberType = numberKids.item(index)
406                                                                                        .getFirstChild().getNodeValue();
407                                                                        if (isDebugging) {
408                                                                                log.debug("The number type is "
409                                                                                                + numberType);
410                                                                        }
411                                                                } else if (dName.equals("boundsGroup"))
412                                                                // got bounds group
413                                                                {
414                                                                        NodeList boundsList = xpathapi
415                                                                                        .selectNodeList(dNode, "./bounds");
416                                                                        for (i = 0; i < boundsList.getLength(); i++) {
417                                                                                NodeList nl;
418                                                                                Node bound;
419
420                                                                                String exclMin = null, exclMax = null;
421                                                                                try {
422                                                                                        nl = xpathapi.selectNodeList(
423                                                                                                        boundsList.item(i),
424                                                                                                        "./minimum");
425                                                                                        bound = nl.item(0);
426                                                                                        min = bound.getFirstChild()
427                                                                                                        .getNodeValue();
428                                                                                        exclMin = bound.getAttributes()
429                                                                                                        .getNamedItem("exclusive")
430                                                                                                        .getNodeValue();
431                                                                                        nl = xpathapi.selectNodeList(
432                                                                                                        boundsList.item(0),
433                                                                                                        "./maximum");
434                                                                                        bound = nl.item(0);
435                                                                                        max = bound.getFirstChild()
436                                                                                                        .getNodeValue();
437                                                                                        exclMax = bound.getAttributes()
438                                                                                                        .getNamedItem("exclusive")
439                                                                                                        .getNodeValue();
440                                                                                } catch (Exception e) {
441                                                                                        log.debug("Error in handle bound ",
442                                                                                                        e);
443                                                                                }
444                                                                        }
445
446                                                                }
447
448                                                        }
449                                                        Double minNum = null;
450                                                        Double maxNum = null;
451                                                        if (!min.trim().equals("")
452                                                                        && !max.trim().equals("")) {
453                                                                minNum = new Double(min);
454                                                                maxNum = new Double(max);
455                                                        }
456                                                        domain = new NumericDomain(numberType, minNum,
457                                                                        maxNum);
458
459                                                } else if (name.equals("nominal")
460                                                                || name.equals("ordinal")) {
461                                                        NodeList list = xpathapi.selectSingleNode(n,
462                                                                        "nonNumericDomain").getChildNodes();
463                                                        for (int m = 0; m < list.getLength(); m++) {
464                                                                Node dNode = list.item(m);
465                                                                String dName = dNode.getNodeName();
466                                                                if (dName.equals("textDomain")) {
467                                                                        TextDomain textDomain = new TextDomain();
468                                                                        NodeList definitionL = xpathapi
469                                                                                        .selectNodeList(dNode,
470                                                                                                        "./definition");
471                                                                        Node defintionNode = definitionL.item(0);
472                                                                        String definition = defintionNode
473                                                                                        .getFirstChild().getNodeValue();
474                                                                        if (isDebugging) {
475                                                                                log.debug("The definition value is "
476                                                                                                + definition);
477                                                                        }
478                                                                        textDomain.setDefinition(definition);
479                                                                        NodeList nl = xpathapi.selectNodeList(
480                                                                                        dNode, "./pattern");
481                                                                        String[] patternList = new String[nl
482                                                                                        .getLength()];
483                                                                        for (int l = 0; l < nl.getLength(); l++) {
484                                                                                patternList[l] = nl.item(l)
485                                                                                                .getFirstChild().getNodeValue();
486                                                                        }
487                                                                        if (patternList.length > 0) {
488                                                                                textDomain.setPattern(patternList);
489                                                                        }
490                                                                        domain = textDomain;
491
492                                                                } else if (dName.equals("enumeratedDomain")) {
493                                                                        EnumeratedDomain enumerDomain = new EnumeratedDomain();
494                                                                        Vector info = new Vector();
495                                                                        NodeList nl = xpathapi.selectNodeList(
496                                                                                        dNode, "./codeDefinition");
497                                                                        for (int l = 0; l < nl.getLength(); l++) {
498                                                                                info.add(nl.item(l).getFirstChild()
499                                                                                                .getNodeValue());
500                                                                        }
501                                                                        enumerDomain.setInfo(info);
502                                                                        domain = enumerDomain;
503
504                                                                }
505                                                        }
506
507                                                } else if (name.equalsIgnoreCase("datetime")) {
508                                                        DateTimeDomain date = new DateTimeDomain();
509                                                        String formatString = (xpathapi.selectSingleNode(n,
510                                                                        "./formatString")).getFirstChild()
511                                                                        .getNodeValue();
512                                                        if (isDebugging) {
513                                                                log.debug("The format string in date time is "
514                                                                                + formatString);
515                                                        }
516                                                        date.setFormatString(formatString);
517                                                        domain = date;
518
519                                                }
520                                        }
521
522                                } else if (childName.equals("missingValueCode")) {
523                                        log.debug("in missilng valueCode");
524                                        NodeList missingNodeList = child.getChildNodes();
525                                        for (int k = 0; k < missingNodeList.getLength(); k++) {
526                                                Node n = missingNodeList.item(k);
527                                                String name = n.getNodeName();
528                                                if (name.equals("code")) {
529
530                                                        Node missingCodeTextNode = n.getFirstChild();
531                                                        if (missingCodeTextNode != null) {
532                                                                String missingCode = missingCodeTextNode
533                                                                                .getNodeValue();
534                                                                if (isDebugging) {
535                                                                        log.debug("the missing code is "
536                                                                                        + missingCode);
537                                                                }
538                                                                missingCodeVector.add(missingCode);
539                                                                hasMissingValue = true;
540                                                        }
541                                                }
542                                        }
543
544                                }
545                        }
546
547                        String resolvedType;
548                        DataType dataType = domain.getDataType();
549                        resolvedType = dataType.getName();
550                        if (isDebugging) {
551                                log.debug("The final type is " + resolvedType);
552                        }
553                        
554                        //check for duplicates of this name
555                        if (attributeListObj.containsNamedAttribute(attName)) {
556                                int duplicateNameCounter = 1;
557                                while (attributeListObj.containsNamedAttribute(attName)) {
558                                        attName += "_" + duplicateNameCounter;
559                                        duplicateNameCounter++;
560                                }
561                        }
562
563                        Attribute attObj = new Attribute(Integer.toString(elementId),
564                                        attName, attLabel, attDefinition, attUnit, attUnitType,
565                                        resolvedType, attMeasurementScale, domain);
566
567                        // add missing code into attribute
568                        for (int k = 0; k < missingCodeVector.size(); k++) {
569
570                                String missingCodeValue = (String) missingCodeVector
571                                                .elementAt(k);
572                                if (isDebugging) {
573                                        log.debug("the mssing value code " + missingCodeValue
574                                                        + " was added to attribute");
575                                }
576                                attObj.addMissingValueCode(missingCodeValue);
577                        }
578
579                        attributeListObj.add(attObj);
580
581                }
582        }
583
584        /**
585         * pulls the entity information out of the xml and stores it in a hashtable
586         */
587        private void processEntities(CachedXPathAPI xpathapi, NodeList entities,
588                        String xpath) throws SAXException,
589                        javax.xml.transform.TransformerException, Exception {
590                // make sure that entities is not null
591                if (entities == null) {
592                        return;
593                }
594                int entityNodeListLength = entities.getLength();
595
596                String entityName = "";
597                String entityDescription = "";
598                String entityOrientation = "";
599                String entityCaseSensitive = "";
600                String entityNumberOfRecords = "-1";
601                String physicalFile = "";
602                String numHeaderLines = "0";
603                int numFooterLines = 0;
604                String fieldDelimiter = null;
605                String recordDelimiter = "";
606                String compressionMethod = "";
607                String encodingMethod = "";
608                boolean isImageEntity = false;
609                boolean isGZipDataFile = false;
610                boolean isZipDataFile = false;
611                boolean isTarDataFile = false;
612                boolean isSimpleDelimited = true;
613                boolean isCollapseDelimiter = false;
614                TextComplexDataFormat[] formatArray = null;
615                int entityCounter = 0;
616
617                for (int i = 0; i < entityNodeListLength; i++) {
618
619                        if (xpath != null
620                                        && (xpath.equals(SPATIALRASTERENTITY) || xpath
621                                                        .equals(SPATIALVECTORENTITY))) {
622                                isImageEntity = true;
623                        }
624                        // go through the entities and put the information into the hash.
625                        elementId++;
626                        Node entity = entities.item(i);
627                        NodeList entityChildren = entity.getChildNodes();
628                        for (int j = 0; j < entityChildren.getLength(); j++) {
629                                Node child = entityChildren.item(j);
630                                String childName = child.getNodeName();
631
632                                if (childName.equals("entityName")) {
633                                        entityName = child.getFirstChild().getNodeValue();
634                                } else if (childName.equals("entityDescription")) {
635                                        entityDescription = child.getFirstChild().getNodeValue();
636                                } else if (childName.equals("caseSensitive")) {
637                                        entityCaseSensitive = child.getFirstChild().getNodeValue();
638                                } else if (childName.equals("numberOfRecords")) {
639                                        entityNumberOfRecords = child.getFirstChild()
640                                                        .getNodeValue();
641                                        if (entityNumberOfRecords != null) {
642                                                entityNumberOfRecords = entityNumberOfRecords.trim();
643                                                numRecords = (new Integer(entityNumberOfRecords))
644                                                                .intValue();
645                                        }
646                                }
647
648                        }
649
650                        NodeList orientationNodeList = xpathapi.selectNodeList(entity,
651                                        "physical/dataFormat/textFormat/attributeOrientation");
652                        if (orientationNodeList != null
653                                        && orientationNodeList.getLength() > 0) {
654                                entityOrientation = orientationNodeList.item(0).getFirstChild()
655                                                .getNodeValue();
656
657                        }
658
659                        NodeList headerLinesNL = xpathapi.selectNodeList(entity,
660                                        "physical/dataFormat/textFormat/numHeaderLines");
661                        if ((headerLinesNL != null) && (headerLinesNL.getLength() > 0)) {
662                                Node headerLinesNode = headerLinesNL.item(0);
663                                if (headerLinesNode != null) {
664                                        numHeaderLines = headerLinesNode.getFirstChild()
665                                                        .getNodeValue();
666                                }
667                        }
668
669                        NodeList footerLinesNL = xpathapi.selectNodeList(entity,
670                                        "physical/dataFormat/textFormat/numFooterLines");
671                        if ((footerLinesNL != null) && (footerLinesNL.getLength() > 0)) {
672                                Node footerLinesNode = footerLinesNL.item(0);
673                                if (footerLinesNode != null) {
674                                        String footerLineStr = footerLinesNode.getFirstChild()
675                                                        .getNodeValue();
676                                        numFooterLines = (new Integer(footerLineStr.trim()))
677                                                        .intValue();
678                                }
679                        }
680
681                        // Here is the simple delimited data file
682                        NodeList delimiterNL = xpathapi
683                                        .selectNodeList(entity,
684                                                        "physical/dataFormat/textFormat/simpleDelimited/fieldDelimiter");
685                        if (delimiterNL != null && delimiterNL.getLength() > 0) {
686                                fieldDelimiter = delimiterNL.item(0).getFirstChild()
687                                                .getNodeValue();
688                        }
689
690                        // Here is the simple delimited data file
691                        NodeList collapseDelimiterNL = xpathapi
692                                        .selectNodeList(entity,
693                                                        "physical/dataFormat/textFormat/simpleDelimited/collapseDelimiters");
694                        if (collapseDelimiterNL != null
695                                        && collapseDelimiterNL.getLength() > 0) {
696                                String collapseDelimiter = collapseDelimiterNL.item(0)
697                                                .getFirstChild().getNodeValue();
698                                if (collapseDelimiter.equalsIgnoreCase("yes")) {
699                                        isCollapseDelimiter = true;
700                                }
701
702                        }
703
704                        // for complex format data file
705                        NodeList complexFormatNL = xpathapi.selectNodeList(entity,
706                                        "physical/dataFormat/textFormat/complex");
707                        if (complexFormatNL != null && complexFormatNL.getLength() > 0) {
708                                log.debug("in handle complex text data format");
709                                isSimpleDelimited = false;
710                                Node complexFormatNode = complexFormatNL.item(0);
711                                NodeList complexFormatChildren = complexFormatNode
712                                                .getChildNodes();
713                                int childrenLength = complexFormatChildren.getLength();
714                                Vector formatVector = new Vector();
715                                for (int k = 0; k < childrenLength; k++) {
716                                        Node node = complexFormatChildren.item(k);
717                                        if (node != null && node.getNodeName().equals("textFixed")) {
718                                                TextWidthFixedDataFormat textFixedFormat = handleTextFixedDataFormatNode(node);
719                                                if (textFixedFormat != null) {
720                                                        formatVector.add(textFixedFormat);
721                                                        // complexFormatsNumber++;
722                                                }
723
724                                        } else if (node != null
725                                                        && node.getNodeName().equals("textDelimited")) {
726                                                TextDelimitedDataFormat delimitedFormat = handleComplexDelimitedDataFormatNode(node);
727                                                if (delimitedFormat != null) {
728                                                        formatVector.add(delimitedFormat);
729                                                        // complexFormatsNumber++;
730                                                }
731                                        }
732                                }
733                                // transfer vector to array
734                                complexFormatsNumber = formatVector.size();
735                                formatArray = new TextComplexDataFormat[complexFormatsNumber];
736                                for (int j = 0; j < complexFormatsNumber; j++) {
737                                        formatArray[j] = (TextComplexDataFormat) formatVector
738                                                        .elementAt(j);
739                                }
740
741                        }
742
743                        NodeList recDelimiterNL = xpathapi.selectNodeList(entity,
744                                        "physical/dataFormat/textFormat/recordDelimiter");
745                        if ((recDelimiterNL != null) && (recDelimiterNL.getLength() > 0)) {
746                                recordDelimiter = recDelimiterNL.item(0).getFirstChild()
747                                                .getNodeValue();
748                        } else {
749                                recordDelimiter = "\r\n";
750                        }
751                        // get the distribution information
752                        NodeList distributionNL = xpathapi.selectNodeList(entity,
753                                        "physical/distribution/online/url");
754                        if (distributionNL != null && distributionNL.getLength() > 0) {
755                                physicalFile = distributionNL.item(0).getFirstChild()
756                                                .getNodeValue();
757                                if (isDebugging) {
758                                        log.debug("The url is " + physicalFile);
759                                }
760                        }
761                        // if this url is donwloadable, if the value is "information", it is
762                        // not downloadable
763                        // otherwise, it is downloadable
764                        Boolean isDownloadable = true;
765                        NodeList distributionURLNL = xpathapi.selectNodeList(entity,
766                                        "physical/distribution/online/url/@function");
767                        if (distributionURLNL != null && distributionURLNL.getLength() > 0) {
768                                String function = distributionURLNL.item(0).getNodeValue();
769                                log.debug("The function value is ============ " + function);
770                                if (function != null && function.equals(INFORMATION)) {
771                                        isDownloadable = false;
772                                }
773                        }
774
775                        // get the compressionMethod information
776                        NodeList compressionNL = xpathapi.selectNodeList(entity,
777                                        "physical/compressionMethod");
778                        if (compressionNL != null && compressionNL.getLength() > 0) {
779                                compressionMethod = compressionNL.item(0).getFirstChild()
780                                                .getNodeValue();
781                                if (isDebugging) {
782                                        log.debug("Compression method is " + compressionMethod);
783                                }
784                                if (compressionMethod != null
785                                                && compressionMethod.equals(Entity.GZIP)) {
786                                        isGZipDataFile = true;
787                                } else if (compressionMethod != null
788                                                && compressionMethod.equals(Entity.ZIP)) {
789                                        isZipDataFile = true;
790                                }
791                        }
792
793                        // get encoding method info (mainly for tar file)
794                        NodeList encodingNL = xpathapi.selectNodeList(entity,
795                                        "physical/encodingMethod");
796                        if (encodingNL != null && encodingNL.getLength() > 0) {
797                                encodingMethod = encodingNL.item(0).getFirstChild()
798                                                .getNodeValue();
799                                if (isDebugging) {
800                                        log.debug("encoding method is " + encodingMethod);
801                                }
802                                if (encodingMethod != null && encodingMethod.equals(Entity.TAR)) {
803                                        isTarDataFile = true;
804                                }
805                        }
806
807                        if (entityOrientation.trim().equals("column")) {
808                                entityOrientation = Entity.COLUMNMAJOR;
809                        } else {
810                                entityOrientation = Entity.ROWMAJOR;
811                        }
812
813                        if (entityCaseSensitive.equals("yes")) {
814                                entityCaseSensitive = "true";
815                        } else {
816                                entityCaseSensitive = "false";
817                        }
818
819                        entityObject = new Entity(Integer.toString(elementId), entityName
820                                        .trim(), entityDescription.trim(), new Boolean(
821                                        entityCaseSensitive), entityOrientation, new Integer(
822                                        entityNumberOfRecords).intValue());
823                        entityObject.setNumHeaderLines((new Integer(numHeaderLines))
824                                        .intValue());
825                        entityObject.setNumFooterLines(numFooterLines);
826                        entityObject.setSimpleDelimited(isSimpleDelimited);
827                        // for simple dimited data file
828                        if (fieldDelimiter != null) {
829                                entityObject.setDelimiter(fieldDelimiter);
830                        }
831                        entityObject.setCollaplseDelimiter(isCollapseDelimiter);
832
833                        entityObject.setRecordDelimiter(recordDelimiter);
834                        entityObject.setURL(physicalFile);
835                        entityObject.setCompressionMethod(compressionMethod);
836                        entityObject.setIsImageEntity(isImageEntity);
837                        entityObject.setHasGZipDataFile(isGZipDataFile);
838                        entityObject.setHasZipDataFile(isZipDataFile);
839                        entityObject.setHasTarDataFile(isTarDataFile);
840                        entityObject.setDownloadable(isDownloadable);
841
842                        try {
843                                NodeList attNL = xpathapi.selectNodeList(entity,
844                                                "attributeList");
845                                processAttributeList(xpathapi, attNL, entityObject);
846                                entityObject.setDataFormatArray(formatArray);
847
848                        } catch (Exception e) {
849                                log.warn("Error parsing attributes: "
850                                                + e.getMessage()+ " So this entity "+entityObject.getName() +" may not have attribute list");
851                        }
852                        if (entityObject.isDownloadable()) {
853                                entityHash.put(Integer.toString(elementId), entityObject);
854                                entityList.add(entityObject);
855                                entityCounter++;
856                        }
857
858                        // fileHash.put(elementId, physicalFile);
859
860                }
861                numEntities = numEntities + entityCounter;
862
863        }
864
865        /*
866         * This method will digest a text fixed data format node and return a
867         * TextFixedDataFormat object.
868         */
869        private TextWidthFixedDataFormat handleTextFixedDataFormatNode(Node node)
870                        throws Exception {
871                TextWidthFixedDataFormat format = null;
872                if (node == null) {
873                        return format;
874                }
875                NodeList children = node.getChildNodes();
876                int length = children.getLength();
877                for (int i = 0; i < length; i++) {
878                        Node kid = children.item(i);
879                        String elementName = kid.getNodeName();
880                        if (elementName != null && elementName.equals("fieldWidth")) {
881                                String fieldWidthStr = kid.getFirstChild().getNodeValue();
882
883                                int fieldWidth = (new Integer(fieldWidthStr)).intValue();
884                                if (isDebugging) {
885                                        log.debug("The filed width for fix width in eml is "
886                                                        + fieldWidth);
887                                }
888                                format = new TextWidthFixedDataFormat(fieldWidth);
889                        } else if (elementName != null
890                                        && elementName.equals("fieldStartColumn") && format != null) {
891                                String startColumnStr = kid.getFirstChild().getNodeValue();
892                                int startColumn = (new Integer(startColumnStr)).intValue();
893                                if (isDebugging) {
894                                        log.debug("The start column is " + startColumn);
895                                }
896                                format.setFieldStartColumn(startColumn);
897                        } else if (elementName != null && elementName.equals("lineNumber")
898                                        && format != null) {
899                                String lineNumberStr = kid.getFirstChild().getNodeValue();
900                                int lineNumber = (new Integer(lineNumberStr)).intValue();
901                                if (isDebugging) {
902                                        log.debug("The start column is " + lineNumber);
903                                }
904                                format.setLineNumber(lineNumber);
905                        }
906                }
907                return format;
908        }
909
910        /*
911         * This method will digest a delimited data format node and return a
912         * DelimitedFixedFormat object.
913         */
914        private TextDelimitedDataFormat handleComplexDelimitedDataFormatNode(
915                        Node node) throws Exception {
916                TextDelimitedDataFormat format = null;
917                if (node == null) {
918                        return format;
919                }
920                NodeList children = node.getChildNodes();
921                int length = children.getLength();
922                Vector quoteList = new Vector();
923                for (int i = 0; i < length; i++) {
924                        Node kid = children.item(i);
925                        String elementName = kid.getNodeName();
926                        if (elementName != null && elementName.equals("fieldDelimiter")) {
927                                String fieldDelimiter = kid.getFirstChild().getNodeValue();
928                                if (isDebugging) {
929                                        log
930                                                        .debug("The filed delimiter for complex format in eml is "
931                                                                        + fieldDelimiter);
932                                }
933                                format = new TextDelimitedDataFormat(fieldDelimiter);
934                        } else if (elementName != null && elementName.equals("lineNumber")
935                                        && format != null) {
936                                String lineNumberStr = kid.getFirstChild().getNodeValue();
937                                int lineNumber = (new Integer(lineNumberStr)).intValue();
938                                if (isDebugging) {
939                                        log.debug("The line number is " + lineNumber);
940                                }
941                                format.setLineNumber(lineNumber);
942                        } else if (elementName != null
943                                        && elementName.equals("collapseDelimiter")
944                                        && format != null) {
945                                String collapse = kid.getFirstChild().getNodeValue();
946                                if (isDebugging) {
947                                        log.debug("The collapse delimiter " + collapse);
948                                }
949                                format.setCollapseDelimiter(collapse);
950                        } else if (elementName != null
951                                        && elementName.equals("quoteCharacter") && format != null) {
952                                String quote = kid.getFirstChild().getNodeValue();
953                                quoteList.add(quote);
954
955                        }
956
957                }
958                // set up quoteList
959                if (format != null) {
960                        int size = quoteList.size();
961                        String[] quoteArray = new String[size];
962                        for (int i = 0; i < size; i++) {
963                                quoteArray[i] = (String) quoteList.elementAt(i);
964                        }
965                        format.setQuoteCharater(quoteArray);
966                }
967                return format;
968        }
969}