Source code

001/* Utilities for Spark.
002 * 
003 * Copyright (c) 2015 The Regents of the University of California.
004 * All rights reserved.
005 *
006 * '$Author: crawl $'
007 * '$Date: 2015-09-03 18:41:53 +0000 (Thu, 03 Sep 2015) $' 
008 * '$Revision: 33858 $'
009 * 
010 * Permission is hereby granted, without written agreement and without
011 * license or royalty fees, to use, copy, modify, and distribute this
012 * software and its documentation for any purpose, provided that the above
013 * copyright notice and the following two paragraphs appear in all copies
014 * of this software.
015 *
016 * IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
017 * FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
018 * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF
019 * THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF
020 * SUCH DAMAGE.
021 *
022 * THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES,
023 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
024 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE
025 * PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE UNIVERSITY OF
026 * CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
027 * ENHANCEMENTS, OR MODIFICATIONS.
028 *
029 */
030package org.kepler.spark.util;
031
032import java.io.File;
033import java.io.IOException;
034import java.net.URI;
035
036import org.apache.commons.io.FileUtils;
037import org.apache.hadoop.conf.Configuration;
038import org.apache.hadoop.fs.FileStatus;
039import org.apache.hadoop.fs.FileSystem;
040import org.apache.hadoop.fs.Path;
041import org.apache.hadoop.fs.PathFilter;
042
043import ptolemy.util.MessageHandler;
044
045/** Utilities for Spark.
046 * 
047 *  @author Daniel Crawl
048 *  @version $Id: SparkUtilities.java 33858 2015-09-03 18:41:53Z crawl $
049 */
050
051public class SparkUtilities {
052
053    /** This class cannot be instantiated. */
054    private SparkUtilities() { }
055
056    /** Delete an output directory. This method removes files like
057     *  "SUCCESS", "part-r-0000", etc., that are created as output from
058     *  a Spark job. 
059     */
060    public static void deleteDirectoryOutput(URI path, Configuration configuration) {
061        try {
062            Path outputFile = new Path(path);
063            try(FileSystem fileSystem = outputFile.getFileSystem(configuration);) {
064                if(fileSystem.exists(outputFile)) {
065                    if(fileSystem.isDirectory(outputFile)) {
066                        FileStatus[] toDelete = fileSystem.listStatus(outputFile, new PathFilter() {
067                            @Override
068                            public boolean accept(Path pathname) {
069                                String name = pathname.getName();
070                                return name.contains("SUCCESS") ||
071                                        name.contains("crc") ||
072                                        name.contains("metadata") ||
073                                        name.contains("part");
074                            }
075                        });
076                        for(FileStatus f : toDelete) {
077                            if(!fileSystem.delete(f.getPath(), false)) {
078                                System.err.println("WARNING: could not delete " + f);
079                            }
080                        }
081                        FileUtils.deleteDirectory(new File(new File(path), "_temporary"));
082                        //fileSystem.delete(new Path(outputFile, "_temporary" + File.separator + "0"), false);
083                        //fileSystem.delete(new Path(outputFile, "_temporary"), false);
084                    }
085                    if(!fileSystem.delete(outputFile, false)) {
086                        System.err.println("WARNING: could not delete " + outputFile);
087                    }
088                }
089            }
090        } catch(IOException e) {
091            MessageHandler.error("Error deleting existing output.", e);
092        }        
093    }
094}