/*
 * Copyright (c) 2007-2014 Concurrent, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cascading.tap.hadoop;

import java.beans.ConstructorProperties;
import java.io.IOException;

import cascading.flow.FlowProcess;
import cascading.tap.BaseTemplateTap;
import cascading.tap.SinkMode;
import cascading.tap.Tap;
import cascading.tap.hadoop.io.TapOutputCollector;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntrySchemeCollector;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;

/**
 * Class TemplateTap can be used to write tuple streams out to sub-directories based on the values in the {@link Tuple}
 * instance.
 * <p/>
 * The constructor takes a {@link Hfs} {@link cascading.tap.Tap} and a {@link java.util.Formatter} format syntax String. This allows
 * Tuple values at given positions to be used as directory names. Note that Hadoop can only sink to directories, and
 * all files in those directories are "part-xxxxx" files.
 * <p/>
 * {@code openTapsThreshold} limits the number of open files to be output to. This value defaults to 300 files.
 * Each time the threshold is exceeded, 10% of the least recently used open files will be closed.
 * <p/>
 * TemplateTap will populate a given {@code pathTemplate} without regard to case of the values being used. Thus
 * the resulting paths {@code 2012/June/} and {@code 2012/june/} will likely result in two open files into the same
 * location. Forcing the case to be consistent with an upstream {@link cascading.operation.Function} is recommended, see
 * {@link cascading.operation.expression.ExpressionFunction}.
 * <p/>
 * Though Hadoop has no mechanism to prevent simultaneous writes to a directory from multiple jobs, it doesn't mean
 * it's safe to do so. Same is true with the TemplateTap. Interleaving writes to a common parent (root) directory
 * across multiple flows will very likely lead to data loss.
 *
 * @deprecated see {@link cascading.tap.hadoop.PartitionTap}
 */
@Deprecated
public class TemplateTap extends BaseTemplateTap<JobConf, OutputCollector>
  {
  /**
   * Constructor TemplateTap creates a new TemplateTap instance using the given parent {@link Hfs} Tap as the
   * base path and default {@link cascading.scheme.Scheme}, and the pathTemplate as the {@link java.util.Formatter} format String.
   * <p/>
   * Delegates to {@link #TemplateTap(Hfs, String, int)} passing {@code OPEN_TAPS_THRESHOLD_DEFAULT}.
   *
   * @param parent       of type Hfs
   * @param pathTemplate of type String
   */
  @ConstructorProperties({"parent", "pathTemplate"})
  public TemplateTap( Hfs parent, String pathTemplate )
    {
    this( parent, pathTemplate, OPEN_TAPS_THRESHOLD_DEFAULT );
    }

  /**
   * Constructor TemplateTap creates a new TemplateTap instance using the given parent {@link Hfs} Tap as the
   * base path and default {@link cascading.scheme.Scheme}, and the pathTemplate as the {@link java.util.Formatter} format String.
   * <p/>
   * {@code openTapsThreshold} limits the number of open files to be output to.
   *
   * @param parent            of type Hfs
   * @param pathTemplate      of type String
   * @param openTapsThreshold of type int
   */
  @ConstructorProperties({"parent", "pathTemplate", "openTapsThreshold"})
  public TemplateTap( Hfs parent, String pathTemplate, int openTapsThreshold )
    {
    super( parent, pathTemplate, openTapsThreshold );
    }

  /**
   * Constructor TemplateTap creates a new TemplateTap instance using the given parent {@link Hfs} Tap as the
   * base path and default {@link cascading.scheme.Scheme}, and the pathTemplate as the {@link java.util.Formatter} format String.
   *
   * @param parent       of type Hfs
   * @param pathTemplate of type String
   * @param sinkMode     of type SinkMode
   */
  @ConstructorProperties({"parent", "pathTemplate", "sinkMode"})
  public TemplateTap( Hfs parent, String pathTemplate, SinkMode sinkMode )
    {
    super( parent, pathTemplate, sinkMode );
    }

  /**
   * Constructor TemplateTap creates a new TemplateTap instance using the given parent {@link Hfs} Tap as the
   * base path and default {@link cascading.scheme.Scheme}, and the pathTemplate as the {@link java.util.Formatter} format String.
   * <p/>
   * {@code keepParentOnDelete}, when set to true, prevents the parent Tap from being deleted when {@link #deleteResource(Object)}
   * is called, typically an issue when used inside a {@link cascading.cascade.Cascade}.
   * <p/>
   * Delegates to {@link #TemplateTap(Hfs, String, SinkMode, boolean, int)} passing {@code OPEN_TAPS_THRESHOLD_DEFAULT}.
   *
   * @param parent             of type Hfs
   * @param pathTemplate       of type String
   * @param sinkMode           of type SinkMode
   * @param keepParentOnDelete of type boolean
   */
  @ConstructorProperties({"parent", "pathTemplate", "sinkMode", "keepParentOnDelete"})
  public TemplateTap( Hfs parent, String pathTemplate, SinkMode sinkMode, boolean keepParentOnDelete )
    {
    this( parent, pathTemplate, sinkMode, keepParentOnDelete, OPEN_TAPS_THRESHOLD_DEFAULT );
    }

  /**
   * Constructor TemplateTap creates a new TemplateTap instance using the given parent {@link Hfs} Tap as the
   * base path and default {@link cascading.scheme.Scheme}, and the pathTemplate as the {@link java.util.Formatter} format String.
   * <p/>
   * {@code keepParentOnDelete}, when set to true, prevents the parent Tap from being deleted when {@link #deleteResource(Object)}
   * is called, typically an issue when used inside a {@link cascading.cascade.Cascade}.
   * <p/>
   * {@code openTapsThreshold} limits the number of open files to be output to.
   *
   * @param parent             of type Hfs
   * @param pathTemplate       of type String
   * @param sinkMode           of type SinkMode
   * @param keepParentOnDelete of type boolean
   * @param openTapsThreshold  of type int
   */
  @ConstructorProperties({"parent", "pathTemplate", "sinkMode", "keepParentOnDelete", "openTapsThreshold"})
  public TemplateTap( Hfs parent, String pathTemplate, SinkMode sinkMode, boolean keepParentOnDelete, int openTapsThreshold )
    {
    super( parent, pathTemplate, sinkMode, keepParentOnDelete, openTapsThreshold );
    }

  /**
   * Constructor TemplateTap creates a new TemplateTap instance using the given parent {@link Hfs} Tap as the
   * base path and default {@link cascading.scheme.Scheme}, and the pathTemplate as the {@link java.util.Formatter} format String.
   * The pathFields is a selector that selects and orders the fields to be used in the given pathTemplate.
   * <p/>
   * This constructor also allows the sinkFields of the parent Tap to be independent of the pathFields. Thus allowing
   * data not in the result file to be used in the template path name.
   * <p/>
   * Delegates to {@link #TemplateTap(Hfs, String, Fields, int)} passing {@code OPEN_TAPS_THRESHOLD_DEFAULT}.
   *
   * @param parent       of type Hfs
   * @param pathTemplate of type String
   * @param pathFields   of type Fields
   */
  @ConstructorProperties({"parent", "pathTemplate", "pathFields"})
  public TemplateTap( Hfs parent, String pathTemplate, Fields pathFields )
    {
    this( parent, pathTemplate, pathFields, OPEN_TAPS_THRESHOLD_DEFAULT );
    }

  /**
   * Constructor TemplateTap creates a new TemplateTap instance using the given parent {@link Hfs} Tap as the
   * base path and default {@link cascading.scheme.Scheme}, and the pathTemplate as the {@link java.util.Formatter} format String.
   * The pathFields is a selector that selects and orders the fields to be used in the given pathTemplate.
   * <p/>
   * This constructor also allows the sinkFields of the parent Tap to be independent of the pathFields. Thus allowing
   * data not in the result file to be used in the template path name.
   * <p/>
   * {@code openTapsThreshold} limits the number of open files to be output to.
   *
   * @param parent            of type Hfs
   * @param pathTemplate      of type String
   * @param pathFields        of type Fields
   * @param openTapsThreshold of type int
   */
  @ConstructorProperties({"parent", "pathTemplate", "pathFields", "openTapsThreshold"})
  public TemplateTap( Hfs parent, String pathTemplate, Fields pathFields, int openTapsThreshold )
    {
    super( parent, pathTemplate, pathFields, openTapsThreshold );
    }

  /**
   * Constructor TemplateTap creates a new TemplateTap instance using the given parent {@link Hfs} Tap as the
   * base path and default {@link cascading.scheme.Scheme}, and the pathTemplate as the {@link java.util.Formatter} format String.
   * The pathFields is a selector that selects and orders the fields to be used in the given pathTemplate.
   * <p/>
   * This constructor also allows the sinkFields of the parent Tap to be independent of the pathFields. Thus allowing
   * data not in the result file to be used in the template path name.
   *
   * @param parent       of type Hfs
   * @param pathTemplate of type String
   * @param pathFields   of type Fields
   * @param sinkMode     of type SinkMode
   */
  @ConstructorProperties({"parent", "pathTemplate", "pathFields", "sinkMode"})
  public TemplateTap( Hfs parent, String pathTemplate, Fields pathFields, SinkMode sinkMode )
    {
    super( parent, pathTemplate, pathFields, sinkMode );
    }

  /**
   * Constructor TemplateTap creates a new TemplateTap instance using the given parent {@link Hfs} Tap as the
   * base path and default {@link cascading.scheme.Scheme}, and the pathTemplate as the {@link java.util.Formatter} format String.
   * The pathFields is a selector that selects and orders the fields to be used in the given pathTemplate.
   * <p/>
   * This constructor also allows the sinkFields of the parent Tap to be independent of the pathFields. Thus allowing
   * data not in the result file to be used in the template path name.
   * <p/>
   * {@code keepParentOnDelete}, when set to true, prevents the parent Tap from being deleted when {@link #deleteResource(Object)}
   * is called, typically an issue when used inside a {@link cascading.cascade.Cascade}.
   * <p/>
   * Delegates to {@link #TemplateTap(Hfs, String, Fields, SinkMode, boolean, int)} passing
   * {@code OPEN_TAPS_THRESHOLD_DEFAULT}.
   *
   * @param parent             of type Hfs
   * @param pathTemplate       of type String
   * @param pathFields         of type Fields
   * @param sinkMode           of type SinkMode
   * @param keepParentOnDelete of type boolean
   */
  @ConstructorProperties({"parent", "pathTemplate", "pathFields", "sinkMode", "keepParentOnDelete"})
  public TemplateTap( Hfs parent, String pathTemplate, Fields pathFields, SinkMode sinkMode, boolean keepParentOnDelete )
    {
    this( parent, pathTemplate, pathFields, sinkMode, keepParentOnDelete, OPEN_TAPS_THRESHOLD_DEFAULT );
    }

  /**
   * Constructor TemplateTap creates a new TemplateTap instance using the given parent {@link Hfs} Tap as the
   * base path and default {@link cascading.scheme.Scheme}, and the pathTemplate as the {@link java.util.Formatter} format String.
   * The pathFields is a selector that selects and orders the fields to be used in the given pathTemplate.
   * <p/>
   * This constructor also allows the sinkFields of the parent Tap to be independent of the pathFields. Thus allowing
   * data not in the result file to be used in the template path name.
   * <p/>
   * {@code keepParentOnDelete}, when set to true, prevents the parent Tap from being deleted when {@link #deleteResource(Object)}
   * is called, typically an issue when used inside a {@link cascading.cascade.Cascade}.
   * <p/>
   * {@code openTapsThreshold} limits the number of open files to be output to.
   *
   * @param parent             of type Hfs
   * @param pathTemplate       of type String
   * @param pathFields         of type Fields
   * @param sinkMode           of type SinkMode
   * @param keepParentOnDelete of type boolean
   * @param openTapsThreshold  of type int
   */
  @ConstructorProperties({"parent", "pathTemplate", "pathFields", "sinkMode", "keepParentOnDelete",
                          "openTapsThreshold"})
  public TemplateTap( Hfs parent, String pathTemplate, Fields pathFields, SinkMode sinkMode, boolean keepParentOnDelete, int openTapsThreshold )
    {
    super( parent, pathTemplate, pathFields, sinkMode, keepParentOnDelete, openTapsThreshold );
    }

  /**
   * Method createTupleEntrySchemeCollector builds the collector used to write tuples for a single resolved path.
   * <p/>
   * A {@link TapOutputCollector} is created from the given flowProcess, parent Tap and path, and is then handed to a
   * new {@link TupleEntrySchemeCollector} as its output.
   *
   * @param flowProcess of type FlowProcess
   * @param parent      of type Tap
   * @param path        of type String
   * @return TupleEntrySchemeCollector wrapping the newly created {@link TapOutputCollector}
   * @throws IOException if the collectors cannot be created
   */
  @Override
  protected TupleEntrySchemeCollector createTupleEntrySchemeCollector( FlowProcess<JobConf> flowProcess, Tap parent, String path ) throws IOException
    {
    TapOutputCollector outputCollector = new TapOutputCollector( flowProcess, parent, path );

    return new TupleEntrySchemeCollector<JobConf, OutputCollector>( flowProcess, parent, outputCollector );
    }
  }