001    /*
002     * Copyright (c) 2007-2014 Concurrent, Inc. All Rights Reserved.
003     *
004     * Project and contact information: http://www.cascading.org/
005     *
006     * This file is part of the Cascading project.
007     *
008     * Licensed under the Apache License, Version 2.0 (the "License");
009     * you may not use this file except in compliance with the License.
010     * You may obtain a copy of the License at
011     *
012     *     http://www.apache.org/licenses/LICENSE-2.0
013     *
014     * Unless required by applicable law or agreed to in writing, software
015     * distributed under the License is distributed on an "AS IS" BASIS,
016     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
017     * See the License for the specific language governing permissions and
018     * limitations under the License.
019     */
020    
021    package cascading.operation.regex;
022    
023    import java.beans.ConstructorProperties;
024    import java.util.Arrays;
025    import java.util.regex.Matcher;
026    
027    import cascading.flow.FlowProcess;
028    import cascading.operation.Function;
029    import cascading.operation.FunctionCall;
030    import cascading.operation.OperationCall;
031    import cascading.operation.OperationException;
032    import cascading.tuple.Fields;
033    import cascading.tuple.Tuple;
034    import cascading.util.Pair;
035    
036    /**
037     * Class RegexParser is used to extract a matched regex from an incoming argument value.
038     * <p/>
039     * Sometimes its useful to parse out a value from a key/value pair in a string, if the key exists. If the key does
040     * not exist, returning an empty string instead of failing is typically expected.
041     * <p/>
042     * The following regex can extract a value from {@code key1=value1&key2=value2} if key1 exists, otherwise an
043     * empty string is returned:<br/>
044     * <pre>(?<=key1=)[^&]*|$</pre>
045     */
046    public class RegexParser extends RegexOperation<Pair<Matcher, Tuple>> implements Function<Pair<Matcher, Tuple>>
047      {
048      /** Field groups */
049      private int[] groups = null;
050    
051      /**
052       * Constructor RegexParser creates a new RegexParser instance, where the argument Tuple value is matched and returned
053       * in a new Tuple.
054       * <p/>
055       * If the given patternString declares regular expression groups, each group will be returned as a value in the
056       * resulting Tuple. If no groups are declared, the match will be returned as the only value in the resulting Tuple.
057       * <p/>
058       * The fields returned will be {@link Fields#UNKNOWN}, so a variable number of values may be emitted based on the
059       * regular expression given.
060       *
061       * @param patternString of type String
062       */
063      @ConstructorProperties({"patternString"})
064      public RegexParser( String patternString )
065        {
066        super( 1, patternString );
067        }
068    
069      /**
070       * Constructor RegexParser creates a new RegexParser instance, where the argument Tuple value is matched and returned
071       * as the given Field.
072       * <p/>
073       * If the given patternString declares regular expression groups, each group will be returned as a value in the
074       * resulting Tuple. If no groups are declared, the match will be returned as the only value in the resulting Tuple.
075       * <p/>
076       * If the number of fields in the fieldDeclaration does not match the number of groups matched, an {@link OperationException}
077       * will be thrown during runtime.
078       * <p/>
079       * To overcome this, either use the constructors that take an array of groups, or use the {@code (?: ...)} sequence
080       * to tell the regular expression matcher to not capture the group.
081       *
082       * @param fieldDeclaration of type Fields
083       * @param patternString    of type String
084       */
085      @ConstructorProperties({"fieldDeclaration", "patternString"})
086      public RegexParser( Fields fieldDeclaration, String patternString )
087        {
088        super( 1, fieldDeclaration, patternString );
089        }
090    
091      /**
092       * Constructor RegexParser creates a new RegexParser instance, where the patternString is a regular expression
093       * with match groups and whose groups designated by {@code groups} are stored in the appropriate number of new fields.
094       * <p/>
095       * The number of resulting fields will match the number of groups given ({@code groups.length}).
096       *
097       * @param patternString of type String
098       * @param groups        of type int[]
099       */
100      @ConstructorProperties({"patternString", "groups"})
101      public RegexParser( String patternString, int[] groups )
102        {
103        super( 1, Fields.size( verifyReturnLength( groups ) ), patternString );
104    
105        this.groups = Arrays.copyOf( groups, groups.length );
106        }
107    
108      private static int verifyReturnLength( int[] groups )
109        {
110        if( groups == null || groups.length == 0 )
111          throw new IllegalArgumentException( "groups may not be null or 0 length" );
112    
113        return groups.length;
114        }
115    
116      /**
117       * Constructor RegexParser creates a new RegexParser instance, where the patternString is a regular expression
118       * with match groups and whose groups designated by {@code groups} are stored in the named fieldDeclarations.
119       *
120       * @param fieldDeclaration of type Fields
121       * @param patternString    of type String
122       * @param groups           of type int[]
123       */
124      @ConstructorProperties({"fieldDeclaration", "patternString", "groups"})
125      public RegexParser( Fields fieldDeclaration, String patternString, int[] groups )
126        {
127        super( 1, fieldDeclaration, patternString );
128    
129        verifyReturnLength( groups );
130    
131        this.groups = Arrays.copyOf( groups, groups.length );
132    
133        if( !fieldDeclaration.isUnknown() && fieldDeclaration.size() != groups.length )
134          throw new IllegalArgumentException( "fieldDeclaration must equal number of groups to be captured, fields: " + fieldDeclaration.print() );
135        }
136    
137      public int[] getGroups()
138        {
139        if( groups == null )
140          return null;
141    
142        return Arrays.copyOf( groups, groups.length );
143        }
144    
145      @Override
146      public void prepare( FlowProcess flowProcess, OperationCall<Pair<Matcher, Tuple>> operationCall )
147        {
148        operationCall.setContext( new Pair<Matcher, Tuple>( getPattern().matcher( "" ), new Tuple() ) );
149        }
150    
151      @Override
152      public void operate( FlowProcess flowProcess, FunctionCall<Pair<Matcher, Tuple>> functionCall )
153        {
154        String value = functionCall.getArguments().getString( 0 );
155    
156        if( value == null )
157          value = "";
158    
159        Matcher matcher = functionCall.getContext().getLhs().reset( value );
160    
161        if( !matcher.find() )
162          throw new OperationException( "could not match pattern: [" + getPatternString() + "] with value: [" + value + "]" );
163    
164        Tuple output = functionCall.getContext().getRhs();
165    
166        output.clear();
167    
168        if( groups != null )
169          onGivenGroups( functionCall, matcher, output );
170        else
171          onFoundGroups( functionCall, matcher, output );
172        }
173    
174      private final void onFoundGroups( FunctionCall<Pair<Matcher, Tuple>> functionCall, Matcher matcher, Tuple output )
175        {
176        int count = matcher.groupCount();
177    
178        if( count == 0 )
179          {
180          output.add( matcher.group( 0 ) );
181          }
182        else
183          {
184          for( int i = 0; i < count; i++ )
185            output.add( matcher.group( i + 1 ) ); // skip group 0
186          }
187    
188        functionCall.getOutputCollector().add( output );
189        }
190    
191      private final void onGivenGroups( FunctionCall<Pair<Matcher, Tuple>> functionCall, Matcher matcher, Tuple output )
192        {
193        for( int pos : groups )
194          output.add( matcher.group( pos ) );
195    
196        functionCall.getOutputCollector().add( output );
197        }
198    
199      @Override
200      public boolean equals( Object object )
201        {
202        if( this == object )
203          return true;
204        if( !( object instanceof RegexParser ) )
205          return false;
206        if( !super.equals( object ) )
207          return false;
208    
209        RegexParser that = (RegexParser) object;
210    
211        if( !Arrays.equals( groups, that.groups ) )
212          return false;
213    
214        return true;
215        }
216    
217      @Override
218      public int hashCode()
219        {
220        int result = super.hashCode();
221        result = 31 * result + ( groups != null ? Arrays.hashCode( groups ) : 0 );
222        return result;
223        }
224      }