001 /* 002 * Copyright (c) 2007-2014 Concurrent, Inc. All Rights Reserved. 003 * 004 * Project and contact information: http://www.cascading.org/ 005 * 006 * This file is part of the Cascading project. 007 * 008 * Licensed under the Apache License, Version 2.0 (the "License"); 009 * you may not use this file except in compliance with the License. 010 * You may obtain a copy of the License at 011 * 012 * http://www.apache.org/licenses/LICENSE-2.0 013 * 014 * Unless required by applicable law or agreed to in writing, software 015 * distributed under the License is distributed on an "AS IS" BASIS, 016 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 017 * See the License for the specific language governing permissions and 018 * limitations under the License. 019 */ 020 021 package cascading.operation.regex; 022 023 import java.beans.ConstructorProperties; 024 import java.util.Arrays; 025 import java.util.regex.Matcher; 026 027 import cascading.flow.FlowProcess; 028 import cascading.operation.Function; 029 import cascading.operation.FunctionCall; 030 import cascading.operation.OperationCall; 031 import cascading.operation.OperationException; 032 import cascading.tuple.Fields; 033 import cascading.tuple.Tuple; 034 import cascading.util.Pair; 035 036 /** 037 * Class RegexParser is used to extract a matched regex from an incoming argument value. 038 * <p/> 039 * Sometimes its useful to parse out a value from a key/value pair in a string, if the key exists. If the key does 040 * not exist, returning an empty string instead of failing is typically expected. 041 * <p/> 042 * The following regex can extract a value from {@code key1=value1&key2=value2} if key1 exists, otherwise an 043 * empty string is returned:<br/> 044 * <pre>(?<=key1=)[^&]*|$</pre> 045 */ 046 public class RegexParser extends RegexOperation<Pair<Matcher, Tuple>> implements Function<Pair<Matcher, Tuple>> 047 { 048 /** Field groups */ 049 private int[] groups = null; 050 051 /** 052 * Constructor RegexParser creates a new RegexParser instance, where the argument Tuple value is matched and returned 053 * in a new Tuple. 054 * <p/> 055 * If the given patternString declares regular expression groups, each group will be returned as a value in the 056 * resulting Tuple. If no groups are declared, the match will be returned as the only value in the resulting Tuple. 057 * <p/> 058 * The fields returned will be {@link Fields#UNKNOWN}, so a variable number of values may be emitted based on the 059 * regular expression given. 060 * 061 * @param patternString of type String 062 */ 063 @ConstructorProperties({"patternString"}) 064 public RegexParser( String patternString ) 065 { 066 super( 1, patternString ); 067 } 068 069 /** 070 * Constructor RegexParser creates a new RegexParser instance, where the argument Tuple value is matched and returned 071 * as the given Field. 072 * <p/> 073 * If the given patternString declares regular expression groups, each group will be returned as a value in the 074 * resulting Tuple. If no groups are declared, the match will be returned as the only value in the resulting Tuple. 075 * <p/> 076 * If the number of fields in the fieldDeclaration does not match the number of groups matched, an {@link OperationException} 077 * will be thrown during runtime. 078 * <p/> 079 * To overcome this, either use the constructors that take an array of groups, or use the {@code (?: ...)} sequence 080 * to tell the regular expression matcher to not capture the group. 081 * 082 * @param fieldDeclaration of type Fields 083 * @param patternString of type String 084 */ 085 @ConstructorProperties({"fieldDeclaration", "pattenString"}) 086 public RegexParser( Fields fieldDeclaration, String patternString ) 087 { 088 super( 1, fieldDeclaration, patternString ); 089 } 090 091 /** 092 * Constructor RegexParser creates a new RegexParser instance, where the patternString is a regular expression 093 * with match groups and whose groups designated by {@code groups} are stored in the appropriate number of new fields. 094 * <p/> 095 * The number of resulting fields will match the number of groups given ({@code groups.length}). 096 * 097 * @param patternString of type String 098 * @param groups of type int[] 099 */ 100 @ConstructorProperties({"patternString", "groups"}) 101 public RegexParser( String patternString, int[] groups ) 102 { 103 super( 1, Fields.size( verifyReturnLength( groups ) ), patternString ); 104 105 this.groups = Arrays.copyOf( groups, groups.length ); 106 } 107 108 private static int verifyReturnLength( int[] groups ) 109 { 110 if( groups == null || groups.length == 0 ) 111 throw new IllegalArgumentException( "groups may not be null or 0 length" ); 112 113 return groups.length; 114 } 115 116 /** 117 * Constructor RegexParser creates a new RegexParser instance, where the patternString is a regular expression 118 * with match groups and whose groups designated by {@code groups} are stored in the named fieldDeclarations. 119 * 120 * @param fieldDeclaration of type Fields 121 * @param patternString of type String 122 * @param groups of type int[] 123 */ 124 @ConstructorProperties({"fieldDeclaration", "patternString", "groups"}) 125 public RegexParser( Fields fieldDeclaration, String patternString, int[] groups ) 126 { 127 super( 1, fieldDeclaration, patternString ); 128 129 verifyReturnLength( groups ); 130 131 this.groups = Arrays.copyOf( groups, groups.length ); 132 133 if( !fieldDeclaration.isUnknown() && fieldDeclaration.size() != groups.length ) 134 throw new IllegalArgumentException( "fieldDeclaration must equal number of groups to be captured, fields: " + fieldDeclaration.print() ); 135 } 136 137 public int[] getGroups() 138 { 139 if( groups == null ) 140 return null; 141 142 return Arrays.copyOf( groups, groups.length ); 143 } 144 145 @Override 146 public void prepare( FlowProcess flowProcess, OperationCall<Pair<Matcher, Tuple>> operationCall ) 147 { 148 operationCall.setContext( new Pair<Matcher, Tuple>( getPattern().matcher( "" ), new Tuple() ) ); 149 } 150 151 @Override 152 public void operate( FlowProcess flowProcess, FunctionCall<Pair<Matcher, Tuple>> functionCall ) 153 { 154 String value = functionCall.getArguments().getString( 0 ); 155 156 if( value == null ) 157 value = ""; 158 159 Matcher matcher = functionCall.getContext().getLhs().reset( value ); 160 161 if( !matcher.find() ) 162 throw new OperationException( "could not match pattern: [" + getPatternString() + "] with value: [" + value + "]" ); 163 164 Tuple output = functionCall.getContext().getRhs(); 165 166 output.clear(); 167 168 if( groups != null ) 169 onGivenGroups( functionCall, matcher, output ); 170 else 171 onFoundGroups( functionCall, matcher, output ); 172 } 173 174 private final void onFoundGroups( FunctionCall<Pair<Matcher, Tuple>> functionCall, Matcher matcher, Tuple output ) 175 { 176 int count = matcher.groupCount(); 177 178 if( count == 0 ) 179 { 180 output.add( matcher.group( 0 ) ); 181 } 182 else 183 { 184 for( int i = 0; i < count; i++ ) 185 output.add( matcher.group( i + 1 ) ); // skip group 0 186 } 187 188 functionCall.getOutputCollector().add( output ); 189 } 190 191 private final void onGivenGroups( FunctionCall<Pair<Matcher, Tuple>> functionCall, Matcher matcher, Tuple output ) 192 { 193 for( int pos : groups ) 194 output.add( matcher.group( pos ) ); 195 196 functionCall.getOutputCollector().add( output ); 197 } 198 199 @Override 200 public boolean equals( Object object ) 201 { 202 if( this == object ) 203 return true; 204 if( !( object instanceof RegexParser ) ) 205 return false; 206 if( !super.equals( object ) ) 207 return false; 208 209 RegexParser that = (RegexParser) object; 210 211 if( !Arrays.equals( groups, that.groups ) ) 212 return false; 213 214 return true; 215 } 216 217 @Override 218 public int hashCode() 219 { 220 int result = super.hashCode(); 221 result = 31 * result + ( groups != null ? Arrays.hashCode( groups ) : 0 ); 222 return result; 223 } 224 }