001    /*
002     * Copyright (c) 2007-2015 Concurrent, Inc. All Rights Reserved.
003     *
004     * Project and contact information: http://www.cascading.org/
005     *
006     * This file is part of the Cascading project.
007     *
008     * Licensed under the Apache License, Version 2.0 (the "License");
009     * you may not use this file except in compliance with the License.
010     * You may obtain a copy of the License at
011     *
012     *     http://www.apache.org/licenses/LICENSE-2.0
013     *
014     * Unless required by applicable law or agreed to in writing, software
015     * distributed under the License is distributed on an "AS IS" BASIS,
016     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
017     * See the License for the specific language governing permissions and
018     * limitations under the License.
019     */
020    
021    package cascading.tap.hadoop;
022    
023    import java.beans.ConstructorProperties;
024    import java.io.IOException;
025    
026    import cascading.flow.FlowProcess;
027    import cascading.tap.BaseTemplateTap;
028    import cascading.tap.SinkMode;
029    import cascading.tap.Tap;
030    import cascading.tap.hadoop.io.TapOutputCollector;
031    import cascading.tuple.Fields;
032    import cascading.tuple.Tuple;
033    import cascading.tuple.TupleEntrySchemeCollector;
034    import org.apache.hadoop.mapred.JobConf;
035    import org.apache.hadoop.mapred.OutputCollector;
036    
037    /**
038     * Class TemplateTap sinks a tuple stream into sub-directories whose names are built from values in each
039     * {@link Tuple} instance.
040     * <p/>
041     * It is constructed from a parent {@link Hfs} {@link cascading.tap.Tap} and a {@link java.util.Formatter} format
042     * String, which lets Tuple values at given positions serve as directory names. Keep in mind that Hadoop can only
043     * sink to directories, and every file in those directories is a "part-xxxxx" file.
044     * <p/>
045     * {@code openTapsThreshold} caps the number of files open for output at once; it defaults to 300 files. Whenever
046     * the threshold is exceeded, the least recently used 10% of the open files are closed.
047     * <p/>
048     * The given {@code pathTemplate} is populated without regard to the case of the values used, so the resulting
049     * paths {@code 2012/June/} and {@code 2012/june/} will likely produce two open files into the same location.
050     * Making the case consistent with an upstream {@link cascading.operation.Function} is recommended, see
051     * {@link cascading.operation.expression.ExpressionFunction}.
052     * <p/>
053     * Hadoop has no mechanism to prevent multiple jobs from writing to a directory simultaneously, but that does not
054     * make it safe to do so, and the same is true of TemplateTap. Interleaving writes to a common parent (root)
055     * directory across multiple flows will very likely lead to data loss.
056     *
057     * @deprecated see {@link cascading.tap.hadoop.PartitionTap}
058     */
059    @Deprecated
060    public class TemplateTap extends BaseTemplateTap<JobConf, OutputCollector>
061      {
062      /**
063       * Creates a TemplateTap in which the given parent {@link Hfs} Tap supplies the base path and default
064       * {@link cascading.scheme.Scheme}, and pathTemplate is the {@link java.util.Formatter} format String.
065       * <p/>
066       * The open taps threshold is set to {@code OPEN_TAPS_THRESHOLD_DEFAULT}.
067       *
068       * @param parent       of type Hfs
069       * @param pathTemplate of type String
070       */
071      @ConstructorProperties({"parent", "pathTemplate"})
072      public TemplateTap( Hfs parent, String pathTemplate )
073        {
074        this( parent, pathTemplate, OPEN_TAPS_THRESHOLD_DEFAULT );
075        }
076    
077      /**
078       * Creates a TemplateTap in which the given parent {@link Hfs} Tap supplies the base path and default
079       * {@link cascading.scheme.Scheme}, and pathTemplate is the {@link java.util.Formatter} format String.
080       * <p/>
081       * {@code openTapsThreshold} caps the number of files open for output at once.
082       *
083       * @param parent            of type Hfs
084       * @param pathTemplate      of type String
085       * @param openTapsThreshold of type int
086       */
087      @ConstructorProperties({"parent", "pathTemplate", "openTapsThreshold"})
088      public TemplateTap( Hfs parent, String pathTemplate, int openTapsThreshold )
089        {
090        super( parent, pathTemplate, openTapsThreshold );
091        }
092    
093      /**
094       * Creates a TemplateTap in which the given parent {@link Hfs} Tap supplies the base path and default
095       * {@link cascading.scheme.Scheme}, and pathTemplate is the {@link java.util.Formatter} format String.
096       * <p/>
097       * The given sinkMode is handed to the {@link BaseTemplateTap} constructor unchanged.
098       *
099       * @param parent       of type Hfs
100       * @param pathTemplate of type String
101       * @param sinkMode     of type SinkMode
102       */
103      @ConstructorProperties({"parent", "pathTemplate", "sinkMode"})
104      public TemplateTap( Hfs parent, String pathTemplate, SinkMode sinkMode )
105        {
106        super( parent, pathTemplate, sinkMode );
107        }
108    
109      /**
110       * Creates a TemplateTap in which the given parent {@link Hfs} Tap supplies the base path and default
111       * {@link cascading.scheme.Scheme}, and pathTemplate is the {@link java.util.Formatter} format String.
112       * <p/>
113       * {@code keepParentOnDelete}, when true, keeps the parent Tap from being deleted when
114       * {@link #deleteResource(Object)} is called, typically an issue when used inside a
115       * {@link cascading.cascade.Cascade}.
116       * <p/>
117       * The open taps threshold is set to {@code OPEN_TAPS_THRESHOLD_DEFAULT}.
118       *
119       * @param parent             of type Hfs
120       * @param pathTemplate       of type String
121       * @param sinkMode           of type SinkMode
122       * @param keepParentOnDelete of type boolean
123       */
124      @ConstructorProperties({"parent", "pathTemplate", "sinkMode", "keepParentOnDelete"})
125      public TemplateTap( Hfs parent, String pathTemplate, SinkMode sinkMode, boolean keepParentOnDelete )
126        {
127        this( parent, pathTemplate, sinkMode, keepParentOnDelete, OPEN_TAPS_THRESHOLD_DEFAULT );
128        }
129    
130      /**
131       * Creates a TemplateTap in which the given parent {@link Hfs} Tap supplies the base path and default
132       * {@link cascading.scheme.Scheme}, and pathTemplate is the {@link java.util.Formatter} format String.
133       * <p/>
134       * {@code keepParentOnDelete}, when true, keeps the parent Tap from being deleted when
135       * {@link #deleteResource(Object)} is called, typically an issue when used inside a
136       * {@link cascading.cascade.Cascade}.
137       * <p/>
138       * {@code openTapsThreshold} caps the number of files open for output at once.
139       *
140       * @param parent             of type Hfs
141       * @param pathTemplate       of type String
142       * @param sinkMode           of type SinkMode
143       * @param keepParentOnDelete of type boolean
144       * @param openTapsThreshold  of type int
145       */
146      @ConstructorProperties({"parent", "pathTemplate", "sinkMode", "keepParentOnDelete", "openTapsThreshold"})
147      public TemplateTap( Hfs parent, String pathTemplate, SinkMode sinkMode, boolean keepParentOnDelete,
148                          int openTapsThreshold )
149        {
150        super( parent, pathTemplate, sinkMode, keepParentOnDelete, openTapsThreshold );
151        }
152    
153      /**
154       * Creates a TemplateTap in which the given parent {@link Hfs} Tap supplies the base path and default
155       * {@link cascading.scheme.Scheme}, and pathTemplate is the {@link java.util.Formatter} format String.
156       * pathFields is a selector that selects and orders the fields to be used in the given pathTemplate.
157       * <p/>
158       * With this constructor the sinkFields of the parent Tap may be independent of the pathFields, so data that is
159       * not in the result file can still be used in the template path name.
160       * <p/>
161       * The open taps threshold is set to {@code OPEN_TAPS_THRESHOLD_DEFAULT}.
162       *
163       * @param parent       of type Hfs
164       * @param pathTemplate of type String
165       * @param pathFields   of type Fields
166       */
167      @ConstructorProperties({"parent", "pathTemplate", "pathFields"})
168      public TemplateTap( Hfs parent, String pathTemplate, Fields pathFields )
169        {
170        this( parent, pathTemplate, pathFields, OPEN_TAPS_THRESHOLD_DEFAULT );
171        }
172    
173      /**
174       * Creates a TemplateTap in which the given parent {@link Hfs} Tap supplies the base path and default
175       * {@link cascading.scheme.Scheme}, and pathTemplate is the {@link java.util.Formatter} format String.
176       * pathFields is a selector that selects and orders the fields to be used in the given pathTemplate.
177       * <p/>
178       * With this constructor the sinkFields of the parent Tap may be independent of the pathFields, so data that is
179       * not in the result file can still be used in the template path name.
180       * <p/>
181       * {@code openTapsThreshold} caps the number of files open for output at once.
182       *
183       * @param parent            of type Hfs
184       * @param pathTemplate      of type String
185       * @param pathFields        of type Fields
186       * @param openTapsThreshold of type int
187       */
188      @ConstructorProperties({"parent", "pathTemplate", "pathFields", "openTapsThreshold"})
189      public TemplateTap( Hfs parent, String pathTemplate, Fields pathFields, int openTapsThreshold )
190        {
191        super( parent, pathTemplate, pathFields, openTapsThreshold );
192        }
193    
194      /**
195       * Creates a TemplateTap in which the given parent {@link Hfs} Tap supplies the base path and default
196       * {@link cascading.scheme.Scheme}, and pathTemplate is the {@link java.util.Formatter} format String.
197       * pathFields is a selector that selects and orders the fields to be used in the given pathTemplate.
198       * <p/>
199       * With this constructor the sinkFields of the parent Tap may be independent of the pathFields, so data that is
200       * not in the result file can still be used in the template path name.
201       * <p/>
202       * The given sinkMode is handed to the {@link BaseTemplateTap} constructor unchanged.
203       *
204       * @param parent       of type Hfs
205       * @param pathTemplate of type String
206       * @param pathFields   of type Fields
207       * @param sinkMode     of type SinkMode
208       */
209      @ConstructorProperties({"parent", "pathTemplate", "pathFields", "sinkMode"})
210      public TemplateTap( Hfs parent, String pathTemplate, Fields pathFields, SinkMode sinkMode )
211        {
212        super( parent, pathTemplate, pathFields, sinkMode );
213        }
214    
215      /**
216       * Creates a TemplateTap in which the given parent {@link Hfs} Tap supplies the base path and default
217       * {@link cascading.scheme.Scheme}, and pathTemplate is the {@link java.util.Formatter} format String.
218       * pathFields is a selector that selects and orders the fields to be used in the given pathTemplate.
219       * <p/>
220       * With this constructor the sinkFields of the parent Tap may be independent of the pathFields, so data that is
221       * not in the result file can still be used in the template path name.
222       * <p/>
223       * {@code keepParentOnDelete}, when true, keeps the parent Tap from being deleted when
224       * {@link #deleteResource(Object)} is called, typically an issue when used inside a
225       * {@link cascading.cascade.Cascade}.
226       * <p/>
227       * The open taps threshold is set to {@code OPEN_TAPS_THRESHOLD_DEFAULT}.
228       *
229       * @param parent             of type Hfs
230       * @param pathTemplate       of type String
231       * @param pathFields         of type Fields
232       * @param sinkMode           of type SinkMode
233       * @param keepParentOnDelete of type boolean
234       */
235      @ConstructorProperties({"parent", "pathTemplate", "pathFields", "sinkMode", "keepParentOnDelete"})
236      public TemplateTap( Hfs parent, String pathTemplate, Fields pathFields, SinkMode sinkMode,
237                          boolean keepParentOnDelete )
238        {
239        this( parent, pathTemplate, pathFields, sinkMode, keepParentOnDelete, OPEN_TAPS_THRESHOLD_DEFAULT );
240        }
241    
242      /**
243       * Creates a TemplateTap in which the given parent {@link Hfs} Tap supplies the base path and default
244       * {@link cascading.scheme.Scheme}, and pathTemplate is the {@link java.util.Formatter} format String.
245       * pathFields is a selector that selects and orders the fields to be used in the given pathTemplate.
246       * <p/>
247       * With this constructor the sinkFields of the parent Tap may be independent of the pathFields, so data that is
248       * not in the result file can still be used in the template path name.
249       * <p/>
250       * {@code keepParentOnDelete}, when true, keeps the parent Tap from being deleted when
251       * {@link #deleteResource(Object)} is called, typically an issue when used inside a
252       * {@link cascading.cascade.Cascade}.
253       * <p/>
254       * {@code openTapsThreshold} caps the number of files open for output at once.
255       *
256       * @param parent             of type Hfs
257       * @param pathTemplate       of type String
258       * @param pathFields         of type Fields
259       * @param sinkMode           of type SinkMode
260       * @param keepParentOnDelete of type boolean
261       * @param openTapsThreshold  of type int
262       */
263      @ConstructorProperties({"parent", "pathTemplate", "pathFields", "sinkMode", "keepParentOnDelete", "openTapsThreshold"})
264      public TemplateTap( Hfs parent, String pathTemplate, Fields pathFields, SinkMode sinkMode,
265                          boolean keepParentOnDelete, int openTapsThreshold )
266        {
267        super( parent, pathTemplate, pathFields, sinkMode, keepParentOnDelete, openTapsThreshold );
268        }
269    
270      /**
271       * Builds a new {@link TapOutputCollector} for the given parent Tap and path, and wraps it in a new
272       * {@link TupleEntrySchemeCollector}.
273       *
274       * @param flowProcess of type FlowProcess
275       * @param parent      of type Tap
276       * @param path        of type String
277       * @return a TupleEntrySchemeCollector constructed with the new TapOutputCollector
278       * @throws IOException as permitted by the overridden method's signature
279       */
280      @Override
281      protected TupleEntrySchemeCollector createTupleEntrySchemeCollector( FlowProcess<JobConf> flowProcess, Tap parent,
282                                                                           String path ) throws IOException
283        {
284        return new TupleEntrySchemeCollector<JobConf, OutputCollector>( flowProcess, parent,
285          new TapOutputCollector( flowProcess, parent, path ) );
286        }
287      }