Source code

001/* An operator for file sinks.
002 * 
003 * Copyright (c) 2014 The Regents of the University of California.
004 * All rights reserved.
005 *
006 * '$Author: crawl $'
007 * '$Date: 2015-09-03 18:41:53 +0000 (Thu, 03 Sep 2015) $' 
008 * '$Revision: 33858 $'
009 * 
010 * Permission is hereby granted, without written agreement and without
011 * license or royalty fees, to use, copy, modify, and distribute this
012 * software and its documentation for any purpose, provided that the above
013 * copyright notice and the following two paragraphs appear in all copies
014 * of this software.
015 *
016 * IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
017 * FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
018 * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF
019 * THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF
020 * SUCH DAMAGE.
021 *
022 * THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES,
023 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
024 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE
025 * PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE UNIVERSITY OF
026 * CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
027 * ENHANCEMENTS, OR MODIFICATIONS.
028 *
029 */
030package org.kepler.spark.operator;
031
032import java.net.URI;
033
034import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
035import org.apache.spark.api.java.JavaPairRDD;
036import org.kepler.spark.util.SparkUtilities;
037
038/** A data sink that writes to files.
039 * 
040 *  @author Daniel Crawl
041 *  @version $Id: FileDataSink.java 33858 2015-09-03 18:41:53Z crawl $
042 */
043public class FileDataSink extends DataSink {
044
045    /** Create a new FileDataSink.
046     *  @param outputFormatClass the format class to write the file
047     *  @param path the output file path
048     *  @param name the name of the operator  
049     */
050    public FileDataSink(Class<? extends FileOutputFormat<?,?>> outputFormatClass, URI path, String name) {
051        super(1, outputFormatClass, name);
052        _path = path;        
053    }
054    
055    /** Execute the operator. */
056    @Override
057    public JavaPairRDD<Object, ?> execute() {
058        
059        // delete the output path if it exists.
060        SparkUtilities.deleteDirectoryOutput(_path, _configuration);
061        
062        _inputData[0].saveAsNewAPIHadoopFile(_path.toString(), Object.class, Object.class,
063                (Class<? extends FileOutputFormat>) _stub, _configuration);
064        return null;
065    }
066    
067    ///////////////////////////////////////////////////////////////////
068    ////                      private fields                     //////
069
070    /** The path of the output file. */
071    private URI _path;
072
073}