001/*
002 * Copyright (c) 2007-2016 Concurrent, Inc. All Rights Reserved.
003 *
004 * Project and contact information: http://www.cascading.org/
005 *
006 * This file is part of the Cascading project.
007 *
008 * Licensed under the Apache License, Version 2.0 (the "License");
009 * you may not use this file except in compliance with the License.
010 * You may obtain a copy of the License at
011 *
012 *     http://www.apache.org/licenses/LICENSE-2.0
013 *
014 * Unless required by applicable law or agreed to in writing, software
015 * distributed under the License is distributed on an "AS IS" BASIS,
016 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
017 * See the License for the specific language governing permissions and
018 * limitations under the License.
019 */
020
021package cascading.operation.regex;
022
023import java.beans.ConstructorProperties;
024import java.util.regex.Pattern;
025
026import cascading.flow.FlowProcess;
027import cascading.operation.Function;
028import cascading.operation.FunctionCall;
029import cascading.operation.OperationCall;
030import cascading.tuple.Fields;
031import cascading.tuple.Tuple;
032import cascading.util.Pair;
033
034/**
035 * Class RegexSplitter will split an incoming argument value by the given regex delimiter patternString.
036 * <p/>
037 * RegexSplitter only expects one field value. If more than one argument value is passed, only the
038 * first is handled, the remainder are ignored.
039 * <p/>
040 * Note a {@code null} valued argument passed to the parser will be converted to an empty string ({@code ""}) before
041 * the regex is applied.
042 * <p/>
043 * Any Object value will be coerced to a String type if type information is provided. See the
044 * {@link cascading.tuple.type.CoercibleType} interface to control how custom Object types are converted to String
045 * values.
046 */
047public class RegexSplitter extends RegexOperation<Pair<Pattern, Tuple>> implements Function<Pair<Pattern, Tuple>>
048  {
049  private final int length;
050
051  /**
052   * Constructor RegexSplitter creates a new RegexSplitter instance.
053   *
054   * @param patternString of type String
055   */
056  @ConstructorProperties({"patternString"})
057  public RegexSplitter( String patternString )
058    {
059    super( 1, patternString );
060    length = fieldDeclaration.isUnknown() ? -1 : fieldDeclaration.size();
061    }
062
063  /**
064   * Constructor RegexOperation creates a new RegexOperation instance, where the delimiter is the tab character.
065   *
066   * @param fieldDeclaration of type Fields
067   */
068  @ConstructorProperties({"fieldDeclaration"})
069  public RegexSplitter( Fields fieldDeclaration )
070    {
071    super( 1, fieldDeclaration, "\t" );
072    length = fieldDeclaration.isUnknown() ? -1 : fieldDeclaration.size();
073    }
074
075  /**
076   * Constructor RegexSplitter creates a new RegexSplitter instance.
077   *
078   * @param fieldDeclaration of type Fields
079   * @param patternString    of type String
080   */
081  @ConstructorProperties({"fieldDeclaration", "patternString"})
082  public RegexSplitter( Fields fieldDeclaration, String patternString )
083    {
084    super( 1, fieldDeclaration, patternString );
085    length = fieldDeclaration.isUnknown() ? -1 : fieldDeclaration.size();
086    }
087
088  @Override
089  public void prepare( FlowProcess flowProcess, OperationCall<Pair<Pattern, Tuple>> operationCall )
090    {
091    operationCall.setContext( new Pair<Pattern, Tuple>( getPattern(), new Tuple() ) );
092    }
093
094  @Override
095  public void operate( FlowProcess flowProcess, FunctionCall<Pair<Pattern, Tuple>> functionCall )
096    {
097    String value = functionCall.getArguments().getString( 0 );
098
099    if( value == null )
100      value = "";
101
102    Tuple output = functionCall.getContext().getRhs();
103
104    output.clear();
105
106    String[] split = functionCall.getContext().getLhs().split( value, length );
107
108    for( int i = 0; i < split.length; i++ )
109      output.add( split[ i ] );
110
111    functionCall.getOutputCollector().add( output );
112    }
113  }